QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
atomic.cuh
Go to the documentation of this file.
1 #pragma once
2 
13 #if defined(__CUDA_ARCH__)
14 
15 #if __COMPUTE_CAPABILITY__ < 600
16 
23 static inline __device__ double atomicAdd(double* address, double val)
24 {
25  unsigned long long int* address_as_ull =
26  (unsigned long long int*)address;
27  unsigned long long int old = *address_as_ull, assumed;
28 
29  do {
30  assumed = old;
31  old = atomicCAS(address_as_ull, assumed,
32  __double_as_longlong(val +
33  __longlong_as_double(assumed)));
34 
35  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
36  } while (assumed != old);
37 
38  return __longlong_as_double(old);
39 }
40 #endif
41 
42 #endif
43 
51 static inline __device__ double2 atomicAdd(double2 *addr, double2 val){
52  double2 old = *addr;
53  // This is a necessary evil to avoid conflicts between the atomicAdd
54  // declared in CUDA 8.0+ headers which are visible for host
55  // compilation, which cause a conflict when compiled on clang-cuda.
56  // As a result we do not support any architecture without native
57  // double precision atomics on clang-cuda.
58 #if defined(__CUDA_ARCH__) || CUDA_VERSION >= 8000
59  old.x = atomicAdd((double*)addr, val.x);
60  old.y = atomicAdd((double*)addr + 1, val.y);
61 #endif
62  return old;
63 }
64 
72 static inline __device__ float2 atomicAdd(float2 *addr, float2 val){
73  float2 old = *addr;
74  old.x = atomicAdd((float*)addr, val.x);
75  old.y = atomicAdd((float*)addr + 1, val.y);
76  return old;
77 }
78 
86 static inline __device__ int2 atomicAdd(int2 *addr, int2 val){
87  int2 old = *addr;
88  old.x = atomicAdd((int*)addr, val.x);
89  old.y = atomicAdd((int*)addr + 1, val.y);
90  return old;
91 }
92 
93 union uint32_short2 { unsigned int i; short2 s; };
94 
102 static inline __device__ short2 atomicAdd(short2 *addr, short2 val){
103  uint32_short2 old, assumed, incremented;
104  old.s = *addr;
105  do {
106  assumed.s = old.s;
107  incremented.s = make_short2(val.x + assumed.s.x, val.y + assumed.s.y);
108  old.i = atomicCAS((unsigned int*)addr, assumed.i, incremented.i);
109  } while ( assumed.i != old.i );
110 
111  return old.s;
112 }
113 
114 union uint32_char2 { unsigned short i; char2 s; };
115 
123 static inline __device__ char2 atomicAdd(char2 *addr, char2 val){
124  uint32_char2 old, assumed, incremented;
125  old.s = *addr;
126  do {
127  assumed.s = old.s;
128  incremented.s = make_char2(val.x + assumed.s.x, val.y + assumed.s.y);
129  old.i = atomicCAS((unsigned int*)addr, assumed.i, incremented.i);
130  } while ( assumed.i != old.i );
131 
132  return old.s;
133 }
134 
142 static inline __device__ float atomicMax(float *addr, float val){
143  unsigned int old = __float_as_uint(*addr), assumed;
144  do {
145  assumed = old;
146  if (__uint_as_float(old) >= val) break;
147 
148  old = atomicCAS((unsigned int*)addr,
149  assumed,
150  __float_as_uint(val));
151  } while ( assumed != old );
152 
153  return __uint_as_float(old);
154 }
unsigned int i
Definition: atomic.cuh:93
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
Definition: atomic.cuh:51
short2 s
Definition: atomic.cuh:93
unsigned short i
Definition: atomic.cuh:114
static __device__ float atomicMax(float *addr, float val)
Implementation of single-precision atomic max using compare and swap. May not support NaNs properly...
Definition: atomic.cuh:142