QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
convert.h
Go to the documentation of this file.
1 #pragma once
2 
11 #include <quda_internal.h> // for maximum short, char traits.
12 
13 namespace quda
14 {
15 
/**
   @brief Number of scalar components held by a (vector) type.
   The primary template covers every type without a specialization
   below and reports 0.
 */
template <typename type> inline int vecLength()
{
  return 0;
}

// scalar types: one component
template <> inline int vecLength<char>()
{
  return 1;
}
template <> inline int vecLength<short>()
{
  return 1;
}
template <> inline int vecLength<float>()
{
  return 1;
}
template <> inline int vecLength<double>()
{
  return 1;
}

// two-component vector types
template <> inline int vecLength<char2>()
{
  return 2;
}
template <> inline int vecLength<short2>()
{
  return 2;
}
template <> inline int vecLength<float2>()
{
  return 2;
}
template <> inline int vecLength<double2>()
{
  return 2;
}

// four-component vector types
template <> inline int vecLength<char4>()
{
  return 4;
}
template <> inline int vecLength<short4>()
{
  return 4;
}
template <> inline int vecLength<float4>()
{
  return 4;
}
template <> inline int vecLength<double4>()
{
  return 4;
}
32 
// Fixed-point short -> floating point: the stored integer is widened and
// multiplied by fixedInvMaxValue<short>::value.

/** @brief Convert a short to float, scaled by fixedInvMaxValue<short>::value. */
inline __host__ __device__ float s2f(short a)
{
  const float widened = static_cast<float>(a);
  return widened * fixedInvMaxValue<short>::value;
}

/** @brief Convert a short to double, scaled by fixedInvMaxValue<short>::value. */
inline __host__ __device__ double s2d(short a)
{
  const double widened = static_cast<double>(a);
  return widened * fixedInvMaxValue<short>::value;
}
36 
// Fixed-point char -> floating point: the stored integer is widened and
// multiplied by fixedInvMaxValue<char>::value.
//
// The signedness of plain `char` is implementation-defined (it is unsigned
// by default on some host ABIs).  The value is therefore reinterpreted as
// `signed char` before widening so that negative fixed-point values keep
// their sign on every target.  Where `char` is already signed this cast is
// a no-op.

/** @brief Convert a (signed) char to float, scaled by fixedInvMaxValue<char>::value. */
inline __host__ __device__ float c2f(char a)
{
  return static_cast<float>(static_cast<signed char>(a)) * fixedInvMaxValue<char>::value;
}

/** @brief Convert a (signed) char to double, scaled by fixedInvMaxValue<char>::value. */
inline __host__ __device__ double c2d(char a)
{
  return static_cast<double>(static_cast<signed char>(a)) * fixedInvMaxValue<char>::value;
}
40 
// Fixed-point short -> floating point with an extra caller-supplied scale.
// The scale c is folded into fixedInvMaxValue<short>::value first, and the
// widened integer is multiplied by that product.

/**
   @brief Convert a short to float with an additional scale factor.
   @param a Fixed-point input
   @param c Extra multiplicative scale
 */
inline __host__ __device__ float s2f(short a, float c)
{
  const float widened = static_cast<float>(a);
  return widened * (fixedInvMaxValue<short>::value * c);
}

/**
   @brief Convert a short to double with an additional scale factor.
   @param a Fixed-point input
   @param c Extra multiplicative scale
 */
inline __host__ __device__ double s2d(short a, double c)
{
  const double widened = static_cast<double>(a);
  return widened * (fixedInvMaxValue<short>::value * c);
}
50 
// Fixed-point char -> floating point with an extra caller-supplied scale.
// The scale c is folded into fixedInvMaxValue<char>::value first, and the
// widened integer is multiplied by that product.
//
// Plain `char` has implementation-defined signedness, so the input is
// reinterpreted as `signed char` before widening; this keeps negative
// fixed-point values negative on targets where `char` is unsigned and is a
// no-op where `char` is already signed.

/**
   @brief Convert a (signed) char to float with an additional scale factor.
   @param a Fixed-point input
   @param c Extra multiplicative scale
 */
inline __host__ __device__ float c2f(char a, float c)
{
  return static_cast<float>(static_cast<signed char>(a)) * (fixedInvMaxValue<char>::value * c);
}

/**
   @brief Convert a (signed) char to double with an additional scale factor.
   @param a Fixed-point input
   @param c Extra multiplicative scale
 */
inline __host__ __device__ double c2d(char a, double c)
{
  return static_cast<double>(static_cast<signed char>(a)) * (fixedInvMaxValue<char>::value * c);
}
60 
/**
   @brief Same-type copy: plain assignment of b into a.
   Mixed-type copies are provided by the non-template overloads.
   @param a Destination
   @param b Source
 */
template <typename FloatN> __device__ inline void copyFloatN(FloatN &a, const FloatN &b)
{
  a = b;
}
62 
// This is emulating the texture normalized return: char
// Each component is passed through c2f / c2d individually.
__device__ inline void copyFloatN(float2 &a, const char2 &b)
{
  const float vx = c2f(b.x);
  const float vy = c2f(b.y);
  a = make_float2(vx, vy);
}
__device__ inline void copyFloatN(float4 &a, const char4 &b)
{
  const float vx = c2f(b.x);
  const float vy = c2f(b.y);
  const float vz = c2f(b.z);
  const float vw = c2f(b.w);
  a = make_float4(vx, vy, vz, vw);
}
__device__ inline void copyFloatN(double2 &a, const char2 &b)
{
  const double vx = c2d(b.x);
  const double vy = c2d(b.y);
  a = make_double2(vx, vy);
}
__device__ inline void copyFloatN(double4 &a, const char4 &b)
{
  const double vx = c2d(b.x);
  const double vy = c2d(b.y);
  const double vz = c2d(b.z);
  const double vw = c2d(b.w);
  a = make_double4(vx, vy, vz, vw);
}
74 
// This is emulating the texture normalized return: short
// Each component is passed through s2f / s2d individually.
__device__ inline void copyFloatN(float2 &a, const short2 &b)
{
  const float vx = s2f(b.x);
  const float vy = s2f(b.y);
  a = make_float2(vx, vy);
}
__device__ inline void copyFloatN(float4 &a, const short4 &b)
{
  const float vx = s2f(b.x);
  const float vy = s2f(b.y);
  const float vz = s2f(b.z);
  const float vw = s2f(b.w);
  a = make_float4(vx, vy, vz, vw);
}
__device__ inline void copyFloatN(double2 &a, const short2 &b)
{
  const double vx = s2d(b.x);
  const double vy = s2d(b.y);
  a = make_double2(vx, vy);
}
__device__ inline void copyFloatN(double4 &a, const short4 &b)
{
  const double vx = s2d(b.x);
  const double vy = s2d(b.y);
  const double vz = s2d(b.z);
  const double vw = s2d(b.w);
  a = make_double4(vx, vy, vz, vw);
}
86 
// Precision change between float and double vectors of equal length;
// every component is converted on its own.
__device__ inline void copyFloatN(float2 &a, const double2 &b)
{
  a.x = static_cast<float>(b.x);
  a.y = static_cast<float>(b.y);
}
__device__ inline void copyFloatN(double2 &a, const float2 &b)
{
  a.x = static_cast<double>(b.x);
  a.y = static_cast<double>(b.y);
}
__device__ inline void copyFloatN(float4 &a, const double4 &b)
{
  a.x = static_cast<float>(b.x);
  a.y = static_cast<float>(b.y);
  a.z = static_cast<float>(b.z);
  a.w = static_cast<float>(b.w);
}
__device__ inline void copyFloatN(double4 &a, const float4 &b)
{
  a.x = static_cast<double>(b.x);
  a.y = static_cast<double>(b.y);
  a.z = static_cast<double>(b.z);
  a.w = static_cast<double>(b.w);
}
91 
/**
   @brief Fast float to integer conversion.

   Device path: adds the bias 12582912.0f (= 1.5 * 2^23, bit pattern
   0x4B400000) so that the integer part of the input ends up in the low
   mantissa bits, then returns the raw bit pattern of the sum.  The
   returned int therefore still carries the bias in its upper bits; only
   the low-order bits hold the integer, so the result is meant to be
   narrowed (e.g. to short or char) by the caller.  The bits are read with
   the __float_as_int intrinsic rather than a reinterpret_cast through an
   int reference, which avoids type-punning via an incompatible reference.

   Host path: a plain static_cast<int>, i.e. truncation toward zero.
   NOTE(review): the two paths therefore differ both in rounding and in
   the upper bits of the returned value.

   @param f Value to convert
   @return Integer representation as described above
 */
__device__ __host__ inline int f2i(float f)
{
#ifdef __CUDA_ARCH__
  f += 12582912.0f;
  return __float_as_int(f);
#else
  return static_cast<int>(f);
#endif
}
102 
/**
   @brief Fast double to integer conversion.

   Device path: adds the bias 6755399441055744.0 (= 1.5 * 2^52) so that
   the integer part of the input ends up in the low mantissa bits, then
   returns the low 32 bits of the sum.  The low word is extracted with the
   __double2loint intrinsic rather than a reinterpret_cast through an int
   reference, which avoids type-punning via an incompatible reference.

   Host path: a plain static_cast<int>, i.e. truncation toward zero.
   NOTE(review): the two paths can therefore round differently.

   @param d Value to convert
   @return Integer value of d
 */
__device__ __host__ inline int d2i(double d)
{
#ifdef __CUDA_ARCH__
  d += 6755399441055744.0;
  return __double2loint(d);
#else
  return static_cast<int>(d);
#endif
}
113 
/* Here we assume that the input data has already been normalized and shifted. */
// Floating point -> short: every component goes through f2i / d2i and is
// narrowed to short by make_short2 / make_short4.
__device__ inline void copyFloatN(short2 &a, const float2 &b)
{
  const int ix = f2i(b.x);
  const int iy = f2i(b.y);
  a = make_short2(ix, iy);
}
__device__ inline void copyFloatN(short4 &a, const float4 &b)
{
  const int ix = f2i(b.x);
  const int iy = f2i(b.y);
  const int iz = f2i(b.z);
  const int iw = f2i(b.w);
  a = make_short4(ix, iy, iz, iw);
}
__device__ inline void copyFloatN(short2 &a, const double2 &b)
{
  const int ix = d2i(b.x);
  const int iy = d2i(b.y);
  a = make_short2(ix, iy);
}
__device__ inline void copyFloatN(short4 &a, const double4 &b)
{
  const int ix = d2i(b.x);
  const int iy = d2i(b.y);
  const int iz = d2i(b.z);
  const int iw = d2i(b.w);
  a = make_short4(ix, iy, iz, iw);
}
125 
// Floating point -> char: every component goes through f2i / d2i and is
// narrowed by make_char2 / make_char4.
__device__ inline void copyFloatN(char2 &a, const float2 &b)
{
  const int ix = f2i(b.x);
  const int iy = f2i(b.y);
  a = make_char2(ix, iy);
}
__device__ inline void copyFloatN(char4 &a, const float4 &b)
{
  const int ix = f2i(b.x);
  const int iy = f2i(b.y);
  const int iz = f2i(b.z);
  const int iw = f2i(b.w);
  a = make_char4(ix, iy, iz, iw);
}
__device__ inline void copyFloatN(char2 &a, const double2 &b)
{
  const int ix = d2i(b.x);
  const int iy = d2i(b.y);
  a = make_char2(ix, iy);
}
__device__ inline void copyFloatN(char4 &a, const double4 &b)
{
  const int ix = d2i(b.x);
  const int iy = d2i(b.y);
  const int iz = d2i(b.z);
  const int iw = d2i(b.w);
  a = make_char4(ix, iy, iz, iw);
}
136 
/**
   @brief Convert an array of InputType into an array of OutputType.

   This primary template performs an element-for-element conversion via
   copyFloatN, so it applies when input and output have the same number
   of elements.  Other layouts are handled by explicit specializations.

   @param x Output array
   @param y Input array
   @param N Number of elements written to x
 */
template <typename OutputType, typename InputType>
__device__ inline void convert(OutputType x[], InputType y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    copyFloatN(x[i], y[i]);
  }
}
155 
// short -> float with matching vector length.  Unlike copyFloatN these do
// not rescale: the integer components are converted to float as they are.
template <> __device__ inline void convert<float2, short2>(float2 x[], short2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const short2 &src = y[i];
    x[i] = make_float2(src.x, src.y);
  }
}

template <> __device__ inline void convert<float4, short4>(float4 x[], short4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const short4 &src = y[i];
    x[i] = make_float4(src.x, src.y, src.z, src.w);
  }
}
167 
168 // 4 <-> 2 vector conversion
169 
// double2 -> double4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]).
template <> __device__ inline void convert<double4, double2>(double4 x[], double2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const double2 &lo = y[2 * i];
    const double2 &hi = y[2 * i + 1];
    x[i] = make_double4(lo.x, lo.y, hi.x, hi.y);
  }
}

// double4 -> double2: each input is split into two outputs.  N / 2 inputs
// are consumed, so for odd N the last element of x is left untouched.
template <> __device__ inline void convert<double2, double4>(double2 x[], double4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const double4 &src = y[i];
    x[2 * i] = make_double2(src.x, src.y);
    x[2 * i + 1] = make_double2(src.z, src.w);
  }
}
184 
// float2 -> float4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]).
template <> __device__ inline void convert<float4, float2>(float4 x[], float2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const float2 &lo = y[2 * i];
    const float2 &hi = y[2 * i + 1];
    x[i] = make_float4(lo.x, lo.y, hi.x, hi.y);
  }
}

// float4 -> float2: each input is split into two outputs.  N / 2 inputs
// are consumed, so for odd N the last element of x is left untouched.
template <> __device__ inline void convert<float2, float4>(float2 x[], float4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const float4 &src = y[i];
    x[2 * i] = make_float2(src.x, src.y);
    x[2 * i + 1] = make_float2(src.z, src.w);
  }
}
199 
// float2 -> short4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]); components go through f2i before narrowing.
template <> __device__ inline void convert<short4, float2>(short4 x[], float2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const float2 &lo = y[2 * i];
    const float2 &hi = y[2 * i + 1];
    x[i] = make_short4(f2i(lo.x), f2i(lo.y), f2i(hi.x), f2i(hi.y));
  }
}

// short4 -> float2: each input is split into two outputs without any
// rescaling.  N / 2 inputs are consumed, so for odd N the last element of
// x is left untouched.
template <> __device__ inline void convert<float2, short4>(float2 x[], short4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const short4 &src = y[i];
    x[2 * i] = make_float2(src.x, src.y);
    x[2 * i + 1] = make_float2(src.z, src.w);
  }
}
215 
// short2 -> float4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]) without any rescaling.
template <> __device__ inline void convert<float4, short2>(float4 x[], short2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const short2 &lo = y[2 * i];
    const short2 &hi = y[2 * i + 1];
    x[i] = make_float4(lo.x, lo.y, hi.x, hi.y);
  }
}

// float4 -> short2: each input is split into two outputs; components go
// through f2i before narrowing.  N / 2 inputs are consumed, so for odd N
// the last element of x is left untouched.
template <> __device__ inline void convert<short2, float4>(short2 x[], float4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const float4 &src = y[i];
    x[2 * i] = make_short2(f2i(src.x), f2i(src.y));
    x[2 * i + 1] = make_short2(f2i(src.z), f2i(src.w));
  }
}
230 
// double2 -> short4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]); components go through d2i before narrowing.
template <> __device__ inline void convert<short4, double2>(short4 x[], double2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const double2 &lo = y[2 * i];
    const double2 &hi = y[2 * i + 1];
    x[i] = make_short4(d2i(lo.x), d2i(lo.y), d2i(hi.x), d2i(hi.y));
  }
}

// short4 -> double2: each input is split into two outputs without any
// rescaling.  N / 2 inputs are consumed, so for odd N the last element of
// x is left untouched.
template <> __device__ inline void convert<double2, short4>(double2 x[], short4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const short4 &src = y[i];
    x[2 * i] = make_double2(src.x, src.y);
    x[2 * i + 1] = make_double2(src.z, src.w);
  }
}
246 
// short2 -> double4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]) without any rescaling.
template <> __device__ inline void convert<double4, short2>(double4 x[], short2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const short2 &lo = y[2 * i];
    const short2 &hi = y[2 * i + 1];
    x[i] = make_double4(lo.x, lo.y, hi.x, hi.y);
  }
}

// double4 -> short2: each input is split into two outputs; components go
// through d2i before narrowing.  N / 2 inputs are consumed, so for odd N
// the last element of x is left untouched.
template <> __device__ inline void convert<short2, double4>(short2 x[], double4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const double4 &src = y[i];
    x[2 * i] = make_short2(d2i(src.x), d2i(src.y));
    x[2 * i + 1] = make_short2(d2i(src.z), d2i(src.w));
  }
}
261 
// double2 -> float4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]); components are narrowed to float.
template <> __device__ inline void convert<float4, double2>(float4 x[], double2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const double2 &lo = y[2 * i];
    const double2 &hi = y[2 * i + 1];
    x[i] = make_float4(lo.x, lo.y, hi.x, hi.y);
  }
}

// float4 -> double2: each input is split into two outputs.  N / 2 inputs
// are consumed, so for odd N the last element of x is left untouched.
template <> __device__ inline void convert<double2, float4>(double2 x[], float4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const float4 &src = y[i];
    x[2 * i] = make_double2(src.x, src.y);
    x[2 * i + 1] = make_double2(src.z, src.w);
  }
}
276 
// float2 -> double4: N outputs, each packed from two consecutive inputs
// (reads y[0 .. 2N-1]); components are widened to double.
template <> __device__ inline void convert<double4, float2>(double4 x[], float2 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N; i++) {
    const float2 &lo = y[2 * i];
    const float2 &hi = y[2 * i + 1];
    x[i] = make_double4(lo.x, lo.y, hi.x, hi.y);
  }
}

// double4 -> float2: each input is split into two outputs; components are
// narrowed to float.  N / 2 inputs are consumed, so for odd N the last
// element of x is left untouched.
template <> __device__ inline void convert<float2, double4>(float2 x[], double4 y[], const int N)
{
#pragma unroll
  for (int i = 0; i < N / 2; i++) {
    const double4 &src = y[i];
    x[2 * i] = make_float2(src.x, src.y);
    x[2 * i + 1] = make_float2(src.z, src.w);
  }
}
291 
292 } // namespace quda
int vecLength< float2 >()
Definition: convert.h:25
int vecLength< short4 >()
Definition: convert.h:29
int vecLength< double2 >()
Definition: convert.h:26
int vecLength< char4 >()
Definition: convert.h:28
__device__ __host__ int d2i(double d)
Definition: convert.h:104
int vecLength< char2 >()
Definition: convert.h:23
int vecLength< double4 >()
Definition: convert.h:31
int vecLength< float >()
Definition: convert.h:20
int vecLength< short >()
Definition: convert.h:19
__device__ void convert< float4, double2 >(float4 x[], double2 y[], const int N)
Definition: convert.h:262
__device__ void convert< float2, double4 >(float2 x[], double4 y[], const int N)
Definition: convert.h:283
__host__ __device__ double s2d(short a)
Definition: convert.h:35
int vecLength< float4 >()
Definition: convert.h:30
__device__ void convert< double2, double4 >(double2 x[], double4 y[], const int N)
Definition: convert.h:176
__device__ void convert< double2, float4 >(double2 x[], float4 y[], const int N)
Definition: convert.h:268
__device__ void convert< double4, float2 >(double4 x[], float2 y[], const int N)
Definition: convert.h:277
__device__ void convert< short2, double4 >(short2 x[], double4 y[], const int N)
Definition: convert.h:253
__device__ void convert< short2, float4 >(short2 x[], float4 y[], const int N)
Definition: convert.h:222
__device__ void convert< double2, short4 >(double2 x[], short4 y[], const int N)
Definition: convert.h:238
__device__ void convert< double4, double2 >(double4 x[], double2 y[], const int N)
Definition: convert.h:170
__device__ void copyFloatN(FloatN &a, const FloatN &b)
Definition: convert.h:61
__device__ void convert< float2, float4 >(float2 x[], float4 y[], const int N)
Definition: convert.h:191
__device__ void convert< short4, float2 >(short4 x[], float2 y[], const int N)
Definition: convert.h:200
__device__ void convert< float2, short2 >(float2 x[], short2 y[], const int N)
Definition: convert.h:156
__device__ void convert< float4, short4 >(float4 x[], short4 y[], const int N)
Definition: convert.h:162
int vecLength< char >()
Definition: convert.h:18
__device__ void convert< float4, float2 >(float4 x[], float2 y[], const int N)
Definition: convert.h:185
__host__ __device__ float s2f(short a)
Definition: convert.h:34
__device__ void convert< float4, short2 >(float4 x[], short2 y[], const int N)
Definition: convert.h:216
__device__ void convert< short4, double2 >(short4 x[], double2 y[], const int N)
Definition: convert.h:231
int vecLength< short2 >()
Definition: convert.h:24
int vecLength< double >()
Definition: convert.h:21
__host__ __device__ double c2d(char a)
Definition: convert.h:39
__device__ void convert< float2, short4 >(float2 x[], short4 y[], const int N)
Definition: convert.h:207
__device__ void convert< double4, short2 >(double4 x[], short2 y[], const int N)
Definition: convert.h:247
__device__ __host__ int f2i(float f)
Definition: convert.h:93
__host__ __device__ float c2f(char a)
Definition: convert.h:38
int vecLength()
Definition: convert.h:16
__device__ void convert(OutputType x[], InputType y[], const int N)
Definition: convert.h:149