QUDA  v1.1.0
A library for QCD on GPUs
ldg.h
Go to the documentation of this file.
1 #pragma once
3 
namespace detail {

/**
 * Loads the U-sized words that alias an object of type T, one word per
 * __ldg call, and collects them into an array<U, r>.
 *
 * @tparam T  Type of the object pointed to.
 * @tparam U  Word type used for each individual load; defaults to
 *            working_type<T>::type.
 * @tparam r  Number of words this instantiation still has to load. The word
 *            read here sits at index aliased_size<T, U>::value - r.
 *
 * NOTE(review): this listing was recovered from a generated documentation
 * page that dropped several source lines. The result_type typedef is taken
 * from the page's cross-reference index. The default value of `r` and the
 * recursive tail call were reconstructed from how the template is used
 * (load_storage<T> is instantiated with a single argument and the r == 1
 * specialization reads the last word) -- confirm against the upstream header.
 */
template<typename T,
         typename U = typename working_type<T>::type,
         int r = aliased_size<T, U>::value>
struct load_storage {
    typedef array<U, r> result_type;

    // Index of the word loaded by this level of the recursion.
    static const int idx = aliased_size<T, U>::value - r;

    /**
     * @param ptr  Address of the object to load; reinterpreted as const U*.
     * @return     The word at `idx` followed by the remaining r - 1 words.
     */
    __device__ __forceinline__
    static result_type impl(const T* ptr) {
        // presumably array<U, r> is constructible from (head, array<U, r - 1>);
        // verify against the array definition.
        return result_type(__ldg(((const U*)ptr) + idx),
                           load_storage<T, U, r - 1>::impl(ptr));
    }
};

/**
 * Terminal case (r == 1): loads only the last word of the aliased storage.
 */
template<typename T, typename U>
struct load_storage<T, U, 1> {
    typedef array<U, 1> result_type;

    // Index of the final word.
    static const int idx = aliased_size<T, U>::value - 1;

    /**
     * @param ptr  Address of the object to load; reinterpreted as const U*.
     * @return     A one-element array holding the word at `idx`.
     */
    __device__ __forceinline__
    static result_type impl(const T* ptr) {
        return result_type(__ldg(((const U*)ptr) + idx));
    }
};

}
30 
31 
// An undefined __CUDA_ARCH__ (host compilation pass) already evaluates to 0 in
// #if; the explicit defined() check keeps the same branch selection while
// avoiding undefined-macro warnings.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
// Device has ldg (requires compute capability 3.5 or newer).

/**
 * Generic __ldg overload for an arbitrary type T.
 *
 * Loads the object word by word through detail::load_storage<T> into a
 * detail::working_array<T>::type and reassembles it with detail::fuse<T>.
 *
 * @param ptr  Address of the object to load.
 * @return     The loaded object, by value.
 */
template<typename T>
__device__ __forceinline__ T __ldg(const T* ptr) {
    typedef typename detail::working_array<T>::type aliased;
    aliased storage = detail::load_storage<T>::impl(ptr);
    return detail::fuse<T>(storage);
}

#else
// Device does not, fall back.

/**
 * Fallback __ldg overload: a plain dereference of the pointer.
 *
 * @param ptr  Address of the object to load.
 * @return     A copy of *ptr.
 */
template<typename T>
__device__ __forceinline__ T __ldg(const T* ptr) {
    return *ptr;
}

#endif
__device__ __forceinline__ T __ldg(const T *ptr)
Definition: ldg.h:44
Definition: alias.h:4
static const int value
Definition: alias.h:25
array< U, 1 > result_type
Definition: ldg.h:21
__device__ static __forceinline__ result_type impl(const T *ptr)
Definition: ldg.h:24
__device__ static __forceinline__ result_type impl(const T *ptr)
Definition: ldg.h:13
array< U, r > result_type
Definition: ldg.h:10
static const int idx
Definition: ldg.h:11