#if defined(__CUDA_ARCH__)
#if __COMPUTE_CAPABILITY__ < 600
// Implementation of double-precision atomic addition using compare and swap,
// for devices that lack a native double atomicAdd (compute capability < 6.0).
static inline __device__ double atomicAdd(double* address, double val)
{
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);

  return __longlong_as_double(old);
}
#endif
#endif
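As a usage sketch (the kernel below and its names are illustrative, not part of the header), every thread can accumulate into a single global double; on devices of compute capability 6.0 or higher the call resolves to the hardware atomicAdd, while older devices fall back to the CAS loop above:

__global__ void sum_kernel(const double *in, double *result, int n)
{
  // each thread adds its element into one shared accumulator
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(result, in[i]);
}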
// Implementation of double2 atomic addition using two double-precision additions.
static inline __device__ double2 atomicAdd(double2 *addr, double2 val){
  double2 old = *addr;
#if defined(__CUDA_ARCH__) || CUDA_VERSION >= 8000
  old.x = atomicAdd((double*)addr, val.x);
  old.y = atomicAdd((double*)addr + 1, val.y);
#endif
  return old;
}
// Implementation of float2 atomic addition using two single-precision additions.
static inline __device__ float2 atomicAdd(float2 *addr, float2 val){
  float2 old;
  old.x = atomicAdd((float*)addr, val.x);
  old.y = atomicAdd((float*)addr + 1, val.y);
  return old;
}
// Implementation of int2 atomic addition using two integer additions.
static inline __device__ int2 atomicAdd(int2 *addr, int2 val){
  int2 old;
  old.x = atomicAdd((int*)addr, val.x);
  old.y = atomicAdd((int*)addr + 1, val.y);
  return old;
}
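The three vector overloads above follow the same pattern: the two components are updated with independent scalar atomics, so the pair is not updated as a single transaction. A hypothetical sketch (names are illustrative) of using the double2 overload to accumulate complex numbers stored as double2:

__global__ void accumulate_complex(const double2 *in, double2 *total, int n)
{
  // real part accumulates into total->x, imaginary part into total->y;
  // the two components are added with separate atomic operations
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(total, in[i]);
}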
// Implementation of short2 atomic addition using compare and swap on the
// containing 32-bit word.
static inline __device__ short2 atomicAdd(short2 *addr, short2 val){
  union { short2 s; unsigned int i; } old, assumed, incremented;
  old.s = *addr;
  do {
    assumed.s = old.s;
    incremented.s = make_short2(val.x + assumed.s.x, val.y + assumed.s.y);
    old.i = atomicCAS((unsigned int*)addr, assumed.i, incremented.i);
  } while ( assumed.i != old.i );
  return old.s;
}
// Implementation of char2 atomic addition using compare and swap on the
// containing 16-bit word (16-bit atomicCAS requires compute capability 7.0+).
static inline __device__ char2 atomicAdd(char2 *addr, char2 val){
  union { char2 s; unsigned short i; } old, assumed, incremented;
  old.s = *addr;
  do {
    assumed.s = old.s;
    incremented.s = make_char2(val.x + assumed.s.x, val.y + assumed.s.y);
    old.i = atomicCAS((unsigned short*)addr, assumed.i, incremented.i);
  } while ( assumed.i != old.i );
  return old.s;
}
// Implementation of single-precision atomic max using compare and swap.
// May not handle NaNs properly.
static inline __device__ float atomicMax(float *addr, float val){
  unsigned int old = __float_as_uint(*addr), assumed;
  do {
    assumed = old;
    if (__uint_as_float(old) >= val) break;
    old = atomicCAS((unsigned int*)addr, assumed, __float_as_uint(val));
  } while ( assumed != old );
  return __uint_as_float(old);
}
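A hypothetical usage sketch for the emulated float atomicMax (kernel name is illustrative); the accumulator should be initialized to a suitably small value, for example 0.0f or -FLT_MAX, before the kernel is launched:

__global__ void max_kernel(const float *in, float *maximum, int n)
{
  // each thread proposes its value; only larger values win the CAS loop
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicMax(maximum, in[i]);
}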
static __device__ double2 atomicAdd(double2 *addr, double2 val)
    Implementation of double2 atomic addition using two double-precision additions.

static __device__ float atomicMax(float *addr, float val)
    Implementation of single-precision atomic max using compare and swap. May not support NaNs properly.