QUDA  v1.1.0
A library for QCD on GPUs
transpose.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2013, NVIDIA Corporation
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7  * Redistributions of source code must retain the above copyright
8  notice, this list of conditions and the following disclaimer.
9  * Redistributions in binary form must reproduce the above copyright
10  notice, this list of conditions and the following disclaimer in the
11  documentation and/or other materials provided with the distribution.
12  * Neither the name of the <organization> nor the
13  names of its contributors may be used to endorse or promote products
14  derived from this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #pragma once
29 #include <trove/utility.h>
30 #include <trove/rotate.h>
31 #include <trove/shfl.h>
33 #include <trove/static_gcd.h>
34 #include <trove/warp.h>
35 
36 #define WARP_CONVERGED 0xffffffff
37 
38 namespace trove {
39 namespace detail {
40 
41 struct odd{};
42 struct power_of_two{};
43 struct composite{};
44 
45 template<int m, bool ispo2=is_power_of_two<m>::value, bool isodd=is_odd<m>::value>
46 struct tx_algorithm {
47  typedef composite type;
48 };
49 
50 template<int m>
51 struct tx_algorithm<m, true, false> {
52  typedef power_of_two type;
53 };
54 
55 template<int m>
56 struct tx_algorithm<m, false, true> {
57  typedef odd type;
58 };
59 
60 template<int m, typename Schema=typename tx_algorithm<m>::type>
62 
63 template<int m>
67  static const int permute = static_mod_inverse<rotate, m>::value;
68 };
69 
70 template<int m>
72  static const int offset = WARP_SIZE - WARP_SIZE/m;
73  static const int permute = m - 1;
74 };
75 
76 template<int m>
78  static const int c = static_gcd<m, WARP_SIZE>::value;
79  static const int k = static_mod_inverse<m/c, WARP_SIZE/c>::value;
80 };
81 
82 template<int m, typename Schema=typename tx_algorithm<m>::type>
84 
85 template<int m>
87  static const int permute = static_mod_inverse<WARP_SIZE, m>::value;
88 };
89 
90 template<int m>
92  c2r_offset_constants<m, power_of_two> {
93 };
94 
95 template<typename T, template<int> class Permute, int position=0>
96 struct tx_permute_impl{};
97 
98 template<typename T, int s, template<int> class Permute, int position>
99 struct tx_permute_impl<array<T, s>, Permute, position> {
101  static const int idx = Permute<position>::value;
102  template<typename Source>
103  __host__ __device__
104  static Remaining impl(const Source& src) {
105  return Remaining(
106  trove::get<idx>(src),
107  tx_permute_impl<array<T, s-1>, Permute, position+1>::impl(
108  src));
109  }
110 };
111 
112 template<typename T, template<int> class Permute, int position>
113 struct tx_permute_impl<array<T, 1>, Permute, position> {
115  static const int idx = Permute<position>::value;
116  template<typename Source>
117  __host__ __device__
118  static Remaining impl(const Source& src) {
119  return Remaining(trove::get<idx>(src));
120  }
121 };
122 
123 
124 template<int m, int a, int b=0>
126  template<int x>
127  struct eval {
128  static const int value = (a * x + b) % m;
129  };
130 };
131 
132 template<int m>
134  static const int o = WARP_SIZE % m;
135  static const int c = static_gcd<m, WARP_SIZE>::value;
136  static const int p = m / c;
137  template<int x>
138  struct eval {
139  static const int value = (x * o - (x / p)) % m;
140  };
141 };
142 
143 template<int m>
145  template<int x>
146  struct eval {
147  static const int value =
149  };
150 };
151 
152 
153 template<typename Array>
154 __host__ __device__ Array c2r_tx_permute(const Array& t) {
155  return tx_permute_impl<
156  Array,
157  affine_modular_fn<Array::size,
158  c2r_offset_constants<Array::size>::permute>::template eval>::impl(t);
159 }
160 
161 
162 
163 template<typename Array>
164 __host__ __device__ Array composite_c2r_tx_permute(const Array& t) {
165  return tx_permute_impl<
166  Array,
168 }
169 
170 template<typename Array>
171 __host__ __device__ Array composite_r2c_tx_permute(const Array& t) {
172  return tx_permute_impl<
173  Array,
175 }
176 
177 
178 
179 template<typename Array>
180 __host__ __device__ Array r2c_tx_permute(const Array& t) {
181  return tx_permute_impl<
182  Array,
183  affine_modular_fn<Array::size,
184  r2c_offset_constants<Array::size>::permute>::template eval>::impl(t);
185 }
186 
187 
188 template<typename Array, int b, int o>
190 
191 template<int s, int b, int o>
192 struct c2r_compute_offsets_impl<array<int, s>, b, o> {
194  __device__
195  static Array impl(int offset) {
196  if (offset >= b) {
197  offset -= b;
198  } //Poor man's x % b. Requires that o < b.
199  return Array(offset,
201  impl(offset + o));
202  }
203 };
204 
205 template<int b, int o>
206 struct c2r_compute_offsets_impl<array<int, 1>, b, o> {
208  __device__
209  static Array impl(int offset) {
210  if (offset >= b) {
211  offset -= b;
212  } //Poor man's x % b. Requires that o < b.
213  return Array(offset);
214  }
215 };
216 
217 template<int m, typename Schema>
219 
220 template<int m>
223  __device__ static int impl() {
224  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
225  int initial_offset = ((WARP_SIZE - warp_id) * constants::offset) & WARP_MASK;
226  return initial_offset;
227  }
228 };
229 
230 template<int m>
232  __device__ static int impl() {
233  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
234  int initial_offset = ((warp_id * (WARP_SIZE + 1)) >> static_log<m>::value) & WARP_MASK;
235  return initial_offset;
236  }
237 };
238 
239 template<int m, typename Schema>
241 
242 template<int m>
244  __device__ static int impl() {
245  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
246  int initial_offset = (warp_id * m) & WARP_MASK;
247  return initial_offset;
248  }
249 };
250 
251 
252 template<int m, typename Schema>
253 __device__
255  typedef c2r_offset_constants<m> constants;
256  int initial_offset = c2r_compute_initial_offset<m, Schema>::impl();
258  WARP_SIZE,
259  constants::offset>::impl(initial_offset);
260 }
261 
262 template<typename T, int m, int p = 0>
264 
265 template<int s, int m, int p>
266 struct c2r_compute_composite_offsets<array<int, s>, m, p> {
267  static const int n = WARP_SIZE;
268  static const int mod_n = n - 1;
269  static const int c = static_gcd<m, WARP_SIZE>::value;
270  static const int k = static_mod_inverse<m/c, n/c>::value;
271  static const int mod_c = c - 1;
272  static const int log_c = static_log<c>::value;
273  static const int n_div_c = n / c;
274  static const int mod_n_div_c = n_div_c - 1;
275  static const int log_n_div_c = static_log<n_div_c>::value;
277  __host__ __device__ static result_type impl(int idx, int col) {
278  int offset = ((((idx >> log_c) * k) & mod_n_div_c) +
279  ((idx & mod_c) << log_n_div_c)) & mod_n;
280  int new_idx = idx + n - 1;
281  new_idx = (p == m - c + (col & mod_c)) ? new_idx + m : new_idx;
282  return
283  result_type(offset,
285  ::impl(new_idx, col));
286 
287  }
288 };
289 
290 template<int m, int p>
291 struct c2r_compute_composite_offsets<array<int, 1>, m, p> {
292  static const int n = WARP_SIZE;
293  static const int mod_n = n - 1;
294  static const int c = static_gcd<m, WARP_SIZE>::value;
295  static const int k = static_mod_inverse<m/c, n/c>::value;
296  static const int mod_c = c - 1;
297  static const int log_c = static_log<c>::value;
298  static const int n_div_c = n / c;
299  static const int mod_n_div_c = n_div_c - 1;
300  static const int log_n_div_c = static_log<n_div_c>::value;
302  __host__ __device__ static result_type impl(int idx, int col) {
303  int offset = ((((idx >> log_c) * k) & mod_n_div_c) +
304  ((idx & mod_c) << log_n_div_c)) & mod_n;
305  return result_type(offset);
306 
307  }
308 };
309 
310 
311 template<int index, int offset, int bound>
312 struct r2c_offsets {
313  static const int value = (offset * index) % bound;
314 };
315 
316 template<typename Array, int index, int m, typename Schema>
318 
319 template<int s, int index, int m>
320 struct r2c_compute_offsets_impl<array<int, s>, index, m, odd> {
322  static const int offset = (WARP_SIZE % m * index) % m;
323  __device__
324  static Array impl(int initial_offset) {
325  int current_offset = (initial_offset + offset) & WARP_MASK;
326  return Array(current_offset,
328  index + 1, m, odd>::impl(initial_offset));
329  }
330 };
331 
332 template<int index, int m>
333 struct r2c_compute_offsets_impl<array<int, 1>, index, m, odd> {
335  static const int offset = (WARP_SIZE % m * index) % m;
336  __device__
337  static Array impl(int initial_offset) {
338  int current_offset = (initial_offset + offset) & WARP_MASK;
339  return Array(current_offset);
340  }
341 };
342 
343 
344 template<int s, int index, int m>
345 struct r2c_compute_offsets_impl<array<int, s>, index, m, power_of_two> {
347  __device__
348  static Array impl(int offset, int lb) {
349  int new_offset = (offset == lb) ? offset + m - 1 : offset - 1;
350  return Array(offset,
351  r2c_compute_offsets_impl<array<int, s-1>, index + 1, m, power_of_two>::impl(new_offset, lb));
352  }
353 };
354 
355 template<int index, int m>
356 struct r2c_compute_offsets_impl<array<int, 1>, index, m, power_of_two> {
358  __device__
359  static Array impl(int offset, int lb) {
360  return Array(offset);
361  }
362 };
363 
364 
365 template<typename T, int m>
367 
368 template<int s, int m>
370  static const int n = WARP_SIZE;
371  static const int mod_n = n - 1;
372  static const int c = static_gcd<m, WARP_SIZE>::value;
373  static const int n_div_c = n / c;
374  static const int log_n_div_c = static_log<n_div_c>::value;
376  __host__ __device__ static result_type impl(int col, int offset, int lb, int ub) {
377  int new_offset = offset + 1;
378  new_offset = (new_offset == ub) ? lb : new_offset;
379  return
380  result_type(offset & mod_n,
382  ::impl(col, new_offset, lb, ub));
383 
384  }
385 };
386 
387 template<int m>
389  static const int n = WARP_SIZE;
390  static const int mod_n = n - 1;
391  static const int c = static_gcd<m, WARP_SIZE>::value;
392  static const int n_div_c = n / c;
393  static const int log_n_div_c = static_log<n_div_c>::value;
395  __host__ __device__ static result_type impl(int col, int offset, int lb, int ub) {
396  return result_type(offset & mod_n);
397 
398  }
399 };
400 
401 template<int m, typename Schema>
402 __device__
404  typedef r2c_offset_constants<m> constants;
405  typedef array<int, m> result_type;
406  int initial_offset = r2c_compute_initial_offset<m, Schema>::impl();
407  return r2c_compute_offsets_impl<result_type,
408  0, m, Schema>::impl(initial_offset);
409 }
410 
411 
412 template<typename Data, typename Indices>
413 struct warp_shuffle {};
414 
415 template<typename T, int m>
416 struct warp_shuffle<array<T, m>, array<int, m> > {
417  __device__ static void impl(array<T, m>& d,
418  const array<int, m>& i) {
419 #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
420  d.head = __shfl_sync(WARP_CONVERGED, d.head, i.head);
421 #else
422  d.head = __shfl(d.head, i.head);
423 #endif
424  warp_shuffle<array<T, m-1>, array<int, m-1> >::impl(d.tail,
425  i.tail);
426  }
427 };
428 
429 template<typename T>
430 struct warp_shuffle<array<T, 1>, array<int, 1> > {
431  __device__ static void impl(array<T, 1>& d,
432  const array<int, 1>& i) {
433 #if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
434  d.head = __shfl_sync(WARP_CONVERGED, d.head, i.head);
435 #else
436  d.head = __shfl(d.head, i.head);
437 #endif
438  }
439 };
440 
441 
442 template<typename Array, typename Schema>
444 
445 template<typename Array>
447  __device__ static void impl(Array& indices, int& rotation) {
448  indices = detail::c2r_compute_offsets<Array::size, odd>();
449  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
450  int size = Array::size;
452  rotation = (warp_id * r) % size;
453  }
454 };
455 
456 template<typename Array>
458  __device__ static void impl(Array& indices, int& rotation) {
459  indices = detail::c2r_compute_offsets<Array::size, power_of_two>();
460  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
461  int size = Array::size;
462  rotation = (size - warp_id) & (size - 1);
463  }
464 };
465 
466 template<typename Array>
468  __device__ static void impl(Array& indices, int& rotation) {
469  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
470 
472  rotation = warp_id % Array::size;
473  }
474 };
475 
476 template<typename Array, typename Indices, typename Schema>
478 
479 template<typename Array, typename Indices>
480 struct c2r_warp_transpose_impl<Array, Indices, odd> {
481  __device__ static void impl(Array& src,
482  const Indices& indices,
483  const int& rotation) {
485  src = rotate(detail::c2r_tx_permute(src), rotation);
486  }
487 };
488 
489 template<typename Array, typename Indices>
490 struct c2r_warp_transpose_impl<Array, Indices, power_of_two> {
491  __device__ static void impl(Array& src,
492  const Indices& indices,
493  const int& rotation) {
494  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
495  int pre_rotation = warp_id >> (LOG_WARP_SIZE - static_log<Array::size>::value);
496  src = rotate(src, pre_rotation);
498  }
499 };
500 
501 template<typename Array, typename Indices>
502 struct c2r_warp_transpose_impl<Array, Indices, composite> {
503  __device__ static void impl(Array& src,
504  const Indices& indices,
505  const int& rotation) {
506  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
507  int pre_rotation = warp_id >> static_log<WARP_SIZE / static_gcd<Array::size, WARP_SIZE>::value>::value;
508  src = rotate(src, pre_rotation);
510  src = rotate(src, rotation);
511  src = composite_c2r_tx_permute(src);
512  }
513 };
514 
515 template<typename Array, typename Schema>
517 
518 template<typename Array>
520  __device__ static void impl(Array& indices, int& rotation) {
521  indices =
522  detail::r2c_compute_offsets<Array::size, odd>();
523  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
524  int size = Array::size;
525  int r =
527  rotation = (warp_id * r) % size;
528  }
529 };
530 
531 template<typename Array>
533  static const int m = Array::size;
534  static const int log_m = static_log<m>::value;
535  static const int clear_m = ~(m-1);
536  static const int n = WARP_SIZE;
537  static const int log_n = static_log<n>::value;
538  static const int mod_n = n-1;
539  static const int n_div_m = WARP_SIZE / m;
540  static const int log_n_div_m = static_log<n_div_m>::value;
541  __device__ static void impl(Array& indices, int& rotation) {
542  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
543  int size = Array::size;
544  rotation = warp_id % size;
545  int initial_offset = ((warp_id << log_m) + (warp_id >> log_n_div_m)) & mod_n;
546  int lb = initial_offset & clear_m;
548  }
549 };
550 
551 template<typename Array>
553  static const int size = Array::size;
555  __device__ static void impl(Array& indices, int& rotation) {
556  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
557  rotation = size - (warp_id % size);
558  int lb = (size * warp_id) & WARP_MASK;
559  int ub = lb + size;
560  int offset = lb + warp_id / (WARP_SIZE / c);
561  indices = detail::r2c_compute_composite_offsets<Array, Array::size>::impl(warp_id, offset, lb, ub);
562  }
563 };
564 
565 template<typename Array, typename Indices, typename Schema>
567 
568 template<typename Array, typename Indices>
569 struct r2c_warp_transpose_impl<Array, Indices, odd> {
570  __device__ static void impl(Array& src,
571  const Indices& indices,
572  const int& rotation) {
573  Array rotated = rotate(src, rotation);
575  src = detail::r2c_tx_permute(rotated);
576  }
577 };
578 
579 template<typename Array, typename Indices>
580 struct r2c_warp_transpose_impl<Array, Indices, power_of_two> {
581  __device__ static void impl(Array& src,
582  const Indices& indices,
583  const int& rotation) {
584  Array rotated = rotate(src, rotation);
586  const int size = Array::size;
587  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
588  src = rotate(detail::r2c_tx_permute(rotated),
589  (size-warp_id/(WARP_SIZE/size))%size);
590  }
591 };
592 
593 template<typename Array, typename Indices>
594 struct r2c_warp_transpose_impl<Array, Indices, composite> {
596  static const int size = Array::size;
597  __device__ static void impl(Array& src,
598  const Indices& indices,
599  const int& rotation) {
600  int warp_id = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & WARP_MASK;
601  src = composite_r2c_tx_permute(src);
602  src = rotate(src, rotation);
604  src = rotate(src, size - (warp_id / (WARP_SIZE / c)));
605  }
606 };
607 
608 } //end namespace detail
609 
610 template<int i>
611 __device__ void c2r_compute_indices(array<int, i>& indices, int& rotation) {
612  typedef array<int, i> Array;
614  Array,
616  ::impl(indices, rotation);
617 
618 }
619 
620 template<typename T, int i>
621 __device__ void c2r_warp_transpose(array<T, i>& src,
622  const array<int, i>& indices,
623  int rotation) {
624  typedef array<T, i> Array;
626  Array, array<int, i>,
628  impl(src, indices, rotation);
629 }
630 
631 template<typename T, int i>
632 __device__ void c2r_warp_transpose(array<T, i>& src) {
633  typedef array<T, i> Array;
634  typedef array<int, i> indices_array;
635  indices_array indices;
636  int rotation;
637  c2r_compute_indices(indices, rotation);
638 
640  Array, array<int, i>,
642  impl(src, indices, rotation);
643 }
644 
645 template<int i>
646 __device__ void r2c_compute_indices(array<int, i>& indices, int& rotation) {
647  typedef array<int, i> Array;
649  Array, typename detail::tx_algorithm<i>::type>
650  ::impl(indices, rotation);
651 
652 }
653 
654 template<typename T, int i>
655 __device__ void r2c_warp_transpose(array<T, i>& src,
656  const array<int, i>& indices,
657  int rotation) {
658  typedef array<T, i> Array;
660  Array, array<int, i>,
662  ::impl(src, indices, rotation);
663 }
664 
665 template<typename T, int i>
666 __device__ void r2c_warp_transpose(array<T, i>& src) {
667  typedef array<T, i> Array;
668  typedef array<int, i> indices_array;
669  indices_array indices;
670  int rotation;
671  r2c_compute_indices(indices, rotation);
672 
674  Array, array<int, i>,
676  ::impl(src, indices, rotation);
677 }
678 
679 } //end namespace trove
__device__ __forceinline__ T __shfl(const T &t, const int &i)
Definition: shfl.h:214
Definition: alias.h:4
__host__ __device__ Array composite_r2c_tx_permute(const Array &t)
Definition: transpose.h:171
__device__ array< int, m > c2r_compute_offsets()
Definition: transpose.h:254
__device__ array< int, m > r2c_compute_offsets()
Definition: transpose.h:403
__host__ __device__ Array c2r_tx_permute(const Array &t)
Definition: transpose.h:154
__host__ __device__ Array composite_c2r_tx_permute(const Array &t)
Definition: transpose.h:164
__host__ __device__ Array r2c_tx_permute(const Array &t)
Definition: transpose.h:180
Definition: aos.h:38
__device__ void r2c_compute_indices(array< int, i > &indices, int &rotation)
Definition: transpose.h:646
__host__ __device__ array< T, i > rotate(const array< T, i > &t, int a)
Definition: rotate.h:113
__device__ void c2r_compute_indices(array< int, i > &indices, int &rotation)
Definition: transpose.h:611
__device__ void r2c_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
Definition: transpose.h:655
__device__ void c2r_warp_transpose(array< T, i > &src, const array< int, i > &indices, int rotation)
Definition: transpose.h:621
head_type head
Definition: array.h:67
head_type head
Definition: array.h:38
tail_type tail
Definition: array.h:39
__host__ static __device__ result_type impl(int idx, int col)
Definition: transpose.h:302
__host__ static __device__ result_type impl(int idx, int col)
Definition: transpose.h:277
static __device__ void impl(Array &indices, int &rotation)
Definition: transpose.h:468
static __device__ void impl(Array &indices, int &rotation)
Definition: transpose.h:447
static __device__ void impl(Array &indices, int &rotation)
Definition: transpose.h:458
static __device__ void impl(Array &src, const Indices &indices, const int &rotation)
Definition: transpose.h:503
static __device__ void impl(Array &src, const Indices &indices, const int &rotation)
Definition: transpose.h:481
static __device__ void impl(Array &src, const Indices &indices, const int &rotation)
Definition: transpose.h:491
__host__ static __device__ result_type impl(int col, int offset, int lb, int ub)
Definition: transpose.h:395
__host__ static __device__ result_type impl(int col, int offset, int lb, int ub)
Definition: transpose.h:376
static __device__ void impl(Array &indices, int &rotation)
Definition: transpose.h:555
static __device__ void impl(Array &indices, int &rotation)
Definition: transpose.h:520
static __device__ void impl(Array &indices, int &rotation)
Definition: transpose.h:541
static const int value
Definition: transpose.h:313
static __device__ void impl(Array &src, const Indices &indices, const int &rotation)
Definition: transpose.h:597
static __device__ void impl(Array &src, const Indices &indices, const int &rotation)
Definition: transpose.h:570
static __device__ void impl(Array &src, const Indices &indices, const int &rotation)
Definition: transpose.h:581
__host__ static __device__ Remaining impl(const Source &src)
Definition: transpose.h:118
__host__ static __device__ Remaining impl(const Source &src)
Definition: transpose.h:104
static __device__ void impl(array< T, 1 > &d, const array< int, 1 > &i)
Definition: transpose.h:431
static __device__ void impl(array< T, m > &d, const array< int, m > &i)
Definition: transpose.h:417
#define WARP_CONVERGED
Definition: transpose.h:36
#define LOG_WARP_SIZE
Definition: warp.h:47
#define WARP_MASK
Definition: warp.h:46
#define WARP_SIZE
Definition: warp.h:45