QUDA  v1.1.0
A library for QCD on GPUs
dbldbl.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  * Redistributions of source code must retain the above copyright notice,
8  * this list of conditions and the following disclaimer.
9  *
10  * Redistributions in binary form must reproduce the above copyright notice,
11  * this list of conditions and the following disclaimer in the documentation
12  * and/or other materials provided with the distribution.
13  *
14  * Neither the name of NVIDIA Corporation nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * Release 1.2
33  *
34  * (1) Deployed new implementation of div_dbldbl() and sqrt_dbldbl() based on
35  * Newton-Raphson iteration, providing significant speedup.
36  * (2) Added new function rsqrt_dbldbl() which provides reciprocal square root.
37  *
38  * Release 1.1
39  *
40  * (1) Fixed a bug affecting add_dbldbl() and sub_dbldbl() that in very rare
41  * cases returned results with reduced accuracy.
42  * (2) Replaced the somewhat inaccurate error bounds with the experimentally
43  * observed maximum relative error.
44  */
45 
46 #if !defined(DBLDBL_H_)
47 #define DBLDBL_H_
48 
49 #if defined(__cplusplus)
50 extern "C" {
51 #endif /* __cplusplus */
52 
53 #include <math.h> /* import sqrt() */
54 
55 /* The head of a double-double number is stored in the most significant part
56  of a double2 (the y-component). The tail is stored in the least significant
57  part of the double2 (the x-component). All double-double operands must be
58  normalized on both input to and return from all basic operations, i.e. the
59  magnitude of the tail shall be <= 0.5 ulp of the head.
60 */
61 typedef double2 dbldbl;
62 
63 /* Create a double-double from two doubles. No normalization is performed,
64  so the head and tail components passed in must satisfy the normalization
65  requirement. To create a double-double from two arbitrary double-precision
66  numbers, use add_double_to_dbldbl().
67 */
/* Pack a head/tail pair into a dbldbl without renormalizing. The caller must
   already satisfy the normalization requirement (|tail| <= 0.5 ulp of head);
   to combine two arbitrary doubles, use add_double_to_dbldbl() instead. */
__device__ __forceinline__ dbldbl make_dbldbl (double head, double tail)
{
  dbldbl result;
  result.y = head;  /* most-significant component lives in .y */
  result.x = tail;  /* least-significant component lives in .x */
  return result;
}
75 
76 /* Return the head of a double-double number */
/* Accessor: the head (most-significant double) of a double-double */
__device__ __forceinline__ double get_dbldbl_head (dbldbl a)
{
  const double head = a.y;
  return head;
}
81 
82 /* Return the tail of a double-double number */
/* Accessor: the tail (least-significant double) of a double-double */
__device__ __forceinline__ double get_dbldbl_tail (dbldbl a)
{
  const double tail = a.x;
  return tail;
}
87 
88 /* Compute error-free sum of two unordered doubles. See Knuth, TAOCP vol. 2 */
/* Error-free sum of two unordered doubles (Knuth TwoSum, TAOCP vol. 2).
   Returns a normalized double-double z with z.y + z.x == a + b exactly
   (barring overflow). The sequence of round-to-nearest adds below recovers
   the exact rounding error of a+b; every operation and its operand order is
   part of the algorithm -- do not reorder or "simplify". */
__device__ __forceinline__ dbldbl add_double_to_dbldbl (double a, double b)
{
  double t1, t2;
  dbldbl z;
  z.y = __dadd_rn (a, b);       /* head: rounded sum */
  t1 = __dadd_rn (z.y, -a);     /* b as actually represented in the sum */
  t2 = __dadd_rn (z.y, -t1);    /* a as actually represented in the sum */
  t1 = __dadd_rn (b, -t1);      /* rounding error contributed by b */
  t2 = __dadd_rn (a, -t2);      /* rounding error contributed by a */
  z.x = __dadd_rn (t1, t2);     /* tail: exact error of the head sum */
  return z;
}
101 
102 /* Compute error-free product of two doubles. Take full advantage of FMA */
/* Error-free product of two doubles. A single FMA recovers the exact
   rounding error of the head product, so head + tail == a*b exactly. */
__device__ __forceinline__ dbldbl mul_double_to_dbldbl (double a, double b)
{
  dbldbl prod;
  const double hi = __dmul_rn (a, b);   /* rounded product */
  prod.y = hi;
  prod.x = __fma_rn (a, b, -hi);        /* exact residual: a*b - hi */
  return prod;
}
110 
111 /* Negate a double-double number, by separately negating head and tail */
/* Negation of a double-double: negating head and tail independently is
   exact and preserves normalization. */
__device__ __forceinline__ dbldbl neg_dbldbl (dbldbl a)
{
  dbldbl negated = make_dbldbl (-a.y, -a.x);
  return negated;
}
119 
120 /* Compute high-accuracy sum of two double-double operands. In the absence of
121  underflow and overflow, the maximum relative error observed with 10 billion
122  test cases was 3.0716194922303448e-32 (~= 2**-104.6826).
123  This implementation is based on: Andrew Thall, Extended-Precision
124  Floating-Point Numbers for GPU Computation. Retrieved on 7/12/2011
125  from http://andrewthall.org/papers/df64_qf128.pdf.
126 */
/* High-accuracy sum of two double-double operands (Thall's df64 addition).
   Max observed relative error ~2**-104.68 over 10 billion cases (absent
   underflow/overflow). The code is two TwoSum steps plus a renormalization;
   the operand order of every __dadd_rn is load-bearing -- do not reorder. */
__device__ __forceinline__ dbldbl add_dbldbl (dbldbl a, dbldbl b)
{
  dbldbl z;
  double t1, t2, t3, t4, t5, e;
  /* TwoSum of the heads: t1 = rounded sum, t3 = exact rounding error */
  t1 = __dadd_rn (a.y, b.y);
  t2 = __dadd_rn (t1, -a.y);
  t3 = __dadd_rn (__dadd_rn (a.y, t2 - t1), __dadd_rn (b.y, -t2));
  /* TwoSum of the tails: t4 = rounded sum, t5 = exact rounding error */
  t4 = __dadd_rn (a.x, b.x);
  t2 = __dadd_rn (t4, -a.x);
  t5 = __dadd_rn (__dadd_rn (a.x, t2 - t4), __dadd_rn (b.x, -t2));
  /* fold head error + tail sum into the head, tracking the new error */
  t3 = __dadd_rn (t3, t4);
  t4 = __dadd_rn (t1, t3);
  t3 = __dadd_rn (t1 - t4, t3);
  t3 = __dadd_rn (t3, t5);
  /* renormalize: head gets the rounded total, tail the exact residual */
  z.y = e = __dadd_rn (t4, t3);
  z.x = __dadd_rn (t4 - e, t3);
  return z;
}
145 
146 /* Compute high-accuracy difference of two double-double operands. In the
147  absence of underflow and overflow, the maximum relative error observed
148  with 10 billion test cases was 3.0716194922303448e-32 (~= 2**-104.6826).
149  This implementation is based on: Andrew Thall, Extended-Precision
150  Floating-Point Numbers for GPU Computation. Retrieved on 7/12/2011
151  from http://andrewthall.org/papers/df64_qf128.pdf.
152 */
/* High-accuracy difference of two double-double operands (Thall's df64
   subtraction; mirror of add_dbldbl with b negated inside the TwoSum
   steps). Max observed relative error ~2**-104.68 over 10 billion cases
   (absent underflow/overflow). Operand order is load-bearing. */
__device__ __forceinline__ dbldbl sub_dbldbl (dbldbl a, dbldbl b)
{
  dbldbl z;
  double t1, t2, t3, t4, t5, e;
  /* TwoSum of a.y and -b.y: t1 = rounded difference, t3 = exact error */
  t1 = __dadd_rn (a.y, -b.y);
  t2 = __dadd_rn (t1, -a.y);
  t3 = __dadd_rn (__dadd_rn (a.y, t2 - t1), - __dadd_rn (b.y, t2));
  /* TwoSum of a.x and -b.x: t4 = rounded difference, t5 = exact error */
  t4 = __dadd_rn (a.x, -b.x);
  t2 = __dadd_rn (t4, -a.x);
  t5 = __dadd_rn (__dadd_rn (a.x, t2 - t4), - __dadd_rn (b.x, t2));
  /* fold head error + tail difference into the head, tracking new error */
  t3 = __dadd_rn (t3, t4);
  t4 = __dadd_rn (t1, t3);
  t3 = __dadd_rn (t1 - t4, t3);
  t3 = __dadd_rn (t3, t5);
  /* renormalize: head gets the rounded total, tail the exact residual */
  z.y = e = __dadd_rn (t4, t3);
  z.x = __dadd_rn (t4 - e, t3);
  return z;
}
171 
172 /* Compute high-accuracy product of two double-double operands, taking full
173  advantage of FMA. In the absence of underflow and overflow, the maximum
174  relative error observed with 10 billion test cases was 5.238480533564479e-32
175  (~= 2**-103.9125).
176 */
/* High-accuracy product of two double-double operands, using FMA to capture
   the head product's rounding error exactly. Max observed relative error
   ~2**-103.91 over 10 billion cases (absent underflow/overflow). */
__device__ __forceinline__ dbldbl mul_dbldbl (dbldbl a, dbldbl b)
{
  dbldbl t, z;
  double e;
  t.y = __dmul_rn (a.y, b.y);        /* head*head, rounded */
  t.x = __fma_rn (a.y, b.y, -t.y);   /* exact error of the head product */
  t.x = __fma_rn (a.x, b.x, t.x);    /* + tail*tail */
  t.x = __fma_rn (a.y, b.x, t.x);    /* + cross term head*tail */
  t.x = __fma_rn (a.x, b.y, t.x);    /* + cross term tail*head */
  /* renormalize: head gets the rounded sum, tail the exact residual */
  z.y = e = __dadd_rn (t.y, t.x);
  z.x = __dadd_rn (t.y - e, t.x);
  return z;
}
190 
191 /* Compute high-accuracy quotient of two double-double operands, using Newton-
192  Raphson iteration. Based on: T. Nagai, H. Yoshida, H. Kuroda, Y. Kanada.
193  Fast Quadruple Precision Arithmetic Library on Parallel Computer SR11000/J2.
194  In Proceedings of the 8th International Conference on Computational Science,
195  ICCS '08, Part I, pp. 446-455. In the absence of underflow and overflow, the
196  maximum relative error observed with 10 billion test cases was
197  1.0161322480099059e-31 (~= 2**-102.9566).
198 */
/* High-accuracy quotient of two double-double operands via Newton-Raphson
   refinement of 1/b.y (Nagai et al., ICCS '08). Max observed relative error
   ~2**-102.96 over 10 billion cases (absent underflow/overflow).
   Every FMA here either refines the reciprocal or forms an exact residual;
   do not reorder. */
__device__ __forceinline__ dbldbl div_dbldbl (dbldbl a, dbldbl b)
{
  dbldbl t, z;
  double e, r;
  r = 1.0 / b.y;                     /* initial reciprocal approximation */
  t.y = __dmul_rn (a.y, r);          /* first quotient estimate */
  e = __fma_rn (b.y, -t.y, a.y);     /* residual a.y - b.y*t.y (exact via FMA) */
  t.y = __fma_rn (r, e, t.y);        /* refined quotient head */
  t.x = __fma_rn (b.y, -t.y, a.y);   /* new residual against the head */
  t.x = __dadd_rn (a.x, t.x);        /* + dividend tail */
  t.x = __fma_rn (b.x, -t.y, t.x);   /* - divisor tail contribution */
  e = __dmul_rn (r, t.x);            /* quotient tail estimate */
  t.x = __fma_rn (b.y, -e, t.x);     /* refine the tail one NR step */
  t.x = __fma_rn (r, t.x, e);
  /* renormalize: head gets the rounded sum, tail the exact residual */
  z.y = e = __dadd_rn (t.y, t.x);
  z.x = __dadd_rn (t.y - e, t.x);
  return z;
}
217 
218 /* Compute high-accuracy square root of a double-double number. Newton-Raphson
219  iteration based on equation 4 from a paper by Alan Karp and Peter Markstein,
220  High Precision Division and Square Root, ACM TOMS, vol. 23, no. 4, December
221  1997, pp. 561-589. In the absence of underflow and overflow, the maximum
222  relative error observed with 10 billion test cases was
223  3.7564109505601846e-32 (~= 2**-104.3923).
224 */
/* High-accuracy square root of a double-double, via Newton-Raphson iteration
   on eq. 4 of Karp & Markstein, "High Precision Division and Square Root",
   ACM TOMS 23(4), 1997. Max observed relative error ~2**-104.39 over
   10 billion cases (absent underflow/overflow). */
__device__ __forceinline__ dbldbl sqrt_dbldbl (dbldbl a)
{
  dbldbl t, z;
  double e, y, s, r;
  r = rsqrt (a.y);                  /* initial 1/sqrt approximation */
  if (a.y == 0.0) r = 0.0;          /* rsqrt(0) is Inf; force sqrt(0) = 0 */
  y = __dmul_rn (a.y, r);           /* first sqrt estimate: a.y * rsqrt(a.y) */
  s = __fma_rn (y, -y, a.y);        /* residual a.y - y*y (exact via FMA) */
  r = __dmul_rn (0.5, r);           /* NR correction factor 1/(2*sqrt) */
  /* z = residual + input tail, kept as an error-free head/tail pair */
  z.y = e = __dadd_rn (s, a.x);
  z.x = __dadd_rn (s - e, a.x);
  /* t = correction term r*z as an error-free product */
  t.y = __dmul_rn (r, z.y);
  t.x = __fma_rn (r, z.y, -t.y);
  t.x = __fma_rn (r, z.x, t.x);
  /* add correction to the estimate y with full error tracking */
  r = __dadd_rn (y, t.y);
  s = __dadd_rn (y - r, t.y);
  s = __dadd_rn (s, t.x);
  /* renormalize: head gets the rounded sum, tail the exact residual */
  z.y = e = __dadd_rn (r, s);
  z.x = __dadd_rn (r - e, s);
  return z;
}
246 
247 /* Compute high-accuracy reciprocal square root of a double-double number.
248  Based on Newton-Raphson iteration. In the absence of underflow and overflow,
249  the maximum relative error observed with 10 billion test cases was
250  6.4937771666026349e-32 (~= 2**-103.6026)
251 */
/* High-accuracy reciprocal square root of a double-double, via Newton-
   Raphson iteration. Max observed relative error ~2**-103.60 over 10 billion
   cases (absent underflow/overflow). Note: unlike sqrt_dbldbl there is no
   a.y == 0 guard here, so a zero input yields an infinite result. */
__device__ __forceinline__ dbldbl rsqrt_dbldbl (dbldbl a)
{
  dbldbl z;
  double r, s, e;
  r = rsqrt (a.y);                  /* initial approximation */
  /* s accumulates 1 - a*r*r, the NR residual, using exact FMA pieces */
  e = __dmul_rn (a.y, r);
  s = __fma_rn (e, -r, 1.0);
  e = __fma_rn (a.y, r, -e);        /* error of the a.y*r product */
  s = __fma_rn (e, -r, s);
  e = __dmul_rn (a.x, r);           /* input-tail contribution */
  s = __fma_rn (e, -r, s);
  e = 0.5 * r;                      /* NR correction factor r/2 (exact scale) */
  /* correction term (r/2)*s as an error-free head/tail product */
  z.y = __dmul_rn (e, s);
  z.x = __fma_rn (e, s, -z.y);
  /* add correction to r with full error tracking */
  s = __dadd_rn (r, z.y);
  r = __dadd_rn (r, -s);
  r = __dadd_rn (r, z.y);
  r = __dadd_rn (r, z.x);
  /* renormalize: head gets the rounded sum, tail the exact residual */
  z.y = e = __dadd_rn (s, r);
  z.x = __dadd_rn (s - e, r);
  return z;
}
274 
275 #if defined(__cplusplus)
276 }
277 #endif /* __cplusplus */
278 
/* Thin C++ wrapper around the C-style dbldbl type: a double-double scalar
   with the head stored in a.y and the tail in a.x. */
struct doubledouble {

  dbldbl a;  /* underlying representation: head in a.y, tail in a.x
                (declaration restored -- it was dropped from this extraction;
                the documentation index places it at dbldbl.h:285) */

  __device__ __host__ doubledouble() { a.x = 0.0; a.y = 0.0; }
  __device__ __host__ doubledouble(const doubledouble &a) : a(a.a) { }
  __device__ __host__ doubledouble(const dbldbl &a) : a(a) { }
  /* head/tail must already satisfy the normalization requirement */
  __device__ __host__ doubledouble(const double &head, const double &tail) { a.y = head; a.x = tail; }
  __device__ __host__ doubledouble(const double &head) { a.y = head; a.x = 0.0; }

  /* Assign a plain double: head = value, tail = 0.
     Bug fix: the original had no return statement, which is undefined
     behavior for a function declared to return doubledouble&. */
  __device__ __host__ doubledouble& operator=(const double &head) {
    this->a.y = head;
    this->a.x = 0.0;
    return *this;
  }

  /* Accumulate another double-double using the high-accuracy add */
  __device__ doubledouble& operator+=(const doubledouble &a) {
    this->a = add_dbldbl(this->a, a.a);
    return *this;
  }

  __device__ __host__ double head() const { return a.y; }
  __device__ __host__ double tail() const { return a.x; }

  /* Debug aid: print head and tail (device printf is serialized/slow) */
  __device__ __host__ void print() const { printf("scalar: %16.14e + %16.14e\n", head(), tail()); }

};
309 
/* Order a double-double against a plain double; the comparison uses the
   head only (the tail is <= 0.5 ulp of the head by normalization). */
__device__ inline bool operator>(const doubledouble &a, const double &b) {
  const double lhs = a.head();
  return lhs > b;
}
313 
/* High-accuracy sum of two doubledouble values (delegates to add_dbldbl) */
__device__ inline doubledouble operator+(const doubledouble &a, const doubledouble &b) {
  dbldbl sum = add_dbldbl(a.a, b.a);
  return doubledouble(sum);
}
317 
/* High-accuracy difference of two doubledouble values (delegates to sub_dbldbl) */
__device__ inline doubledouble operator-(const doubledouble &a, const doubledouble &b) {
  dbldbl diff = sub_dbldbl(a.a, b.a);
  return doubledouble(diff);
}
321 
/* High-accuracy product of two doubledouble values (delegates to mul_dbldbl) */
__device__ inline doubledouble operator*(const doubledouble &a, const doubledouble &b) {
  dbldbl prod = mul_dbldbl(a.a, b.a);
  return doubledouble(prod);
}
325 
/* High-accuracy quotient of two doubledouble values (delegates to div_dbldbl) */
__device__ inline doubledouble operator/(const doubledouble &a, const doubledouble &b) {
  dbldbl quot = div_dbldbl(a.a, b.a);
  return doubledouble(quot);
}
329 
/* Error-free sum of two plain doubles, returned as a doubledouble */
__device__ inline doubledouble add_double_to_doubledouble(const double &a, const double &b) {
  dbldbl exact_sum = add_double_to_dbldbl(a, b);
  return doubledouble(exact_sum);
}
333 
/* Error-free product of two plain doubles, returned as a doubledouble */
__device__ inline doubledouble mul_double_to_doubledouble(const double &a, const double &b) {
  dbldbl exact_prod = mul_double_to_dbldbl(a, b);
  return doubledouble(exact_prod);
}
337 
/* Two-component vector of double-double scalars.
   The struct header and member declarations were dropped from this
   extraction (the documentation index places x and y at dbldbl.h:339-340);
   they are restored here. */
struct doubledouble2 {

  doubledouble x;
  doubledouble y;

  __device__ __host__ doubledouble2() : x(), y() { }
  __device__ __host__ doubledouble2(const doubledouble2 &a) : x(a.x), y(a.y) { }
  /* promote a plain double2: each component becomes head-only */
  __device__ __host__ doubledouble2(const double2 &a) : x(a.x), y(a.y) { }
  __device__ __host__ doubledouble2(const doubledouble &x, const doubledouble &y) : x(x), y(y) { }

  /* component-wise high-accuracy accumulation */
  __device__ doubledouble2& operator+=(const doubledouble2 &a) {
    x += a.x;
    y += a.y;
    return *this;
  }

  /* Debug aid: print both components as head + tail pairs */
  __device__ __host__ void print() const { printf("vec2: (%16.14e + %16.14e) (%16.14e + %16.14e)\n", x.head(), x.tail(), y.head(), y.tail()); }
};
355 
/* Three-component vector of double-double scalars.
   The struct header and member declarations were dropped from this
   extraction (the documentation index places x, y, z at dbldbl.h:357-359);
   they are restored here. The default constructor's init list omitted z()
   (z was still default-constructed to zero); it is now listed explicitly
   for consistency with x and y. */
struct doubledouble3 {

  doubledouble x;
  doubledouble y;
  doubledouble z;

  __device__ __host__ doubledouble3() : x(), y(), z() { }
  __device__ __host__ doubledouble3(const doubledouble3 &a) : x(a.x), y(a.y), z(a.z) { }
  /* promote a plain double3: each component becomes head-only */
  __device__ __host__ doubledouble3(const double3 &a) : x(a.x), y(a.y), z(a.z) { }
  __device__ __host__ doubledouble3(const doubledouble &x, const doubledouble &y, const doubledouble &z) : x(x), y(y), z(z) { }

  /* component-wise high-accuracy accumulation */
  __device__ doubledouble3& operator+=(const doubledouble3 &a) {
    x += a.x;
    y += a.y;
    z += a.z;
    return *this;
  }

  /* Debug aid: print all three components as head + tail pairs */
  __device__ __host__ void print() const { printf("vec3: (%16.14e + %16.14e) (%16.14e + %16.14e) (%16.14e + %16.14e)\n", x.head(), x.tail(), y.head(), y.tail(), z.head(), z.tail()); }
};
375 
/* Component-wise high-accuracy sum of two doubledouble2 vectors */
__device__ doubledouble2 operator+(const doubledouble2 &a, const doubledouble2 &b)
{
  doubledouble sx = a.x + b.x;
  doubledouble sy = a.y + b.y;
  return doubledouble2(sx, sy);
}
378 
/* Component-wise high-accuracy sum of two doubledouble3 vectors */
__device__ doubledouble3 operator+(const doubledouble3 &a, const doubledouble3 &b)
{
  doubledouble sx = a.x + b.x;
  doubledouble sy = a.y + b.y;
  doubledouble sz = a.z + b.z;
  return doubledouble3(sx, sy, sz);
}
381 
382 #endif /* DBLDBL_H_ */
__device__ doubledouble operator+(const doubledouble &a, const doubledouble &b)
Definition: dbldbl.h:314
__device__ doubledouble operator/(const doubledouble &a, const doubledouble &b)
Definition: dbldbl.h:326
__device__ __forceinline__ dbldbl neg_dbldbl(dbldbl a)
Definition: dbldbl.h:112
__device__ __forceinline__ dbldbl make_dbldbl(double head, double tail)
Definition: dbldbl.h:68
__device__ doubledouble add_double_to_doubledouble(const double &a, const double &b)
Definition: dbldbl.h:330
__device__ __forceinline__ dbldbl sub_dbldbl(dbldbl a, dbldbl b)
Definition: dbldbl.h:153
__device__ doubledouble mul_double_to_doubledouble(const double &a, const double &b)
Definition: dbldbl.h:334
__device__ bool operator>(const doubledouble &a, const double &b)
Definition: dbldbl.h:310
__device__ __forceinline__ double get_dbldbl_head(dbldbl a)
Definition: dbldbl.h:77
__device__ __forceinline__ dbldbl mul_dbldbl(dbldbl a, dbldbl b)
Definition: dbldbl.h:177
__device__ __forceinline__ dbldbl add_dbldbl(dbldbl a, dbldbl b)
Definition: dbldbl.h:127
__device__ doubledouble operator*(const doubledouble &a, const doubledouble &b)
Definition: dbldbl.h:322
__device__ __forceinline__ dbldbl mul_double_to_dbldbl(double a, double b)
Definition: dbldbl.h:103
__device__ __forceinline__ dbldbl rsqrt_dbldbl(dbldbl a)
Definition: dbldbl.h:252
double2 dbldbl
Definition: dbldbl.h:61
__device__ __forceinline__ dbldbl sqrt_dbldbl(dbldbl a)
Definition: dbldbl.h:225
__device__ doubledouble operator-(const doubledouble &a, const doubledouble &b)
Definition: dbldbl.h:318
__device__ __forceinline__ dbldbl div_dbldbl(dbldbl a, dbldbl b)
Definition: dbldbl.h:199
__device__ __forceinline__ double get_dbldbl_tail(dbldbl a)
Definition: dbldbl.h:83
__device__ __forceinline__ dbldbl add_double_to_dbldbl(double a, double b)
Definition: dbldbl.h:89
doubledouble x
Definition: dbldbl.h:339
doubledouble y
Definition: dbldbl.h:340
__device__ __host__ doubledouble2(const doubledouble2 &a)
Definition: dbldbl.h:343
__device__ __host__ doubledouble2(const doubledouble &x, const doubledouble &y)
Definition: dbldbl.h:345
__device__ __host__ void print() const
Definition: dbldbl.h:353
__device__ __host__ doubledouble2(const double2 &a)
Definition: dbldbl.h:344
__device__ __host__ doubledouble2()
Definition: dbldbl.h:342
__device__ doubledouble2 & operator+=(const doubledouble2 &a)
Definition: dbldbl.h:347
__device__ __host__ doubledouble3(const doubledouble3 &a)
Definition: dbldbl.h:362
__device__ doubledouble3 & operator+=(const doubledouble3 &a)
Definition: dbldbl.h:366
__device__ __host__ doubledouble3()
Definition: dbldbl.h:361
__device__ __host__ doubledouble3(const doubledouble &x, const doubledouble &y, const doubledouble &z)
Definition: dbldbl.h:364
doubledouble x
Definition: dbldbl.h:357
doubledouble y
Definition: dbldbl.h:358
__device__ __host__ doubledouble3(const double3 &a)
Definition: dbldbl.h:363
doubledouble z
Definition: dbldbl.h:359
__device__ __host__ void print() const
Definition: dbldbl.h:373
__device__ __host__ double tail() const
Definition: dbldbl.h:304
__device__ doubledouble & operator+=(const doubledouble &a)
Definition: dbldbl.h:298
__device__ __host__ void print() const
Definition: dbldbl.h:306
__device__ __host__ doubledouble(const doubledouble &a)
Definition: dbldbl.h:288
__device__ __host__ doubledouble(const double &head, const double &tail)
Definition: dbldbl.h:290
__device__ __host__ doubledouble()
Definition: dbldbl.h:287
__device__ __host__ doubledouble & operator=(const double &head)
Definition: dbldbl.h:293
dbldbl a
Definition: dbldbl.h:285
__device__ __host__ doubledouble(const double &head)
Definition: dbldbl.h:291
__device__ __host__ doubledouble(const dbldbl &a)
Definition: dbldbl.h:289
__device__ __host__ double head() const
Definition: dbldbl.h:303