QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
convert.h
Go to the documentation of this file.
1 #ifndef _CONVERT_H
2 #define _CONVERT_H
3 
12 #include <quda_internal.h> // for MAX_SHORT
13 
/**
 * Number of scalar components in a (vector) type.
 *
 * The primary template returns 0 for any type that has no explicit
 * specialization; scalar types short, float and double return 1.
 */
template <typename type> int vecLength() { return 0; }

// Explicit specializations of a function template are NOT implicitly inline.
// They are marked inline so that this header can be included from several
// translation units without producing multiple-definition link errors.
template<> inline int vecLength<short>() { return 1; }
template<> inline int vecLength<float>() { return 1; }
template<> inline int vecLength<double>() { return 1; }
19 
// vecLength specializations for the 2- and 4-component CUDA vector types.
// Marked inline: explicit specializations of a function template are not
// implicitly inline, so without it every translation unit that includes this
// header would emit its own external definition and the link would fail.
template<> inline int vecLength<short2>() { return 2; }
template<> inline int vecLength<float2>() { return 2; }
template<> inline int vecLength<double2>() { return 2; }

template<> inline int vecLength<short4>() { return 4; }
template<> inline int vecLength<float4>() { return 4; }
template<> inline int vecLength<double4>() { return 4; }
27 
// MAX_SHORT 32767
#define MAX_SHORT_INV 3.051850948e-5
/**
 * Convert a short to a float scaled by MAX_SHORT_INV (approximately 1/32767).
 *
 * @param a  value to convert
 * @return   a * MAX_SHORT_INV, evaluated in single precision
 */
static inline __device__ float s2f(const short &a) {
  // MAX_SHORT_INV expands to a double literal; narrowing it to float before the
  // multiply keeps the arithmetic in single precision instead of silently
  // promoting to double and converting back on return. The result may differ
  // from the double-precision product in the last bit.
  return static_cast<float>(a) * static_cast<float>(MAX_SHORT_INV);
}
31 
/**
 * Same-type copy: assigns b to a. Used when source and destination
 * have identical types; mixed-type copies are handled by the overloads.
 */
template <typename FloatN>
__device__ inline void copyFloatN(FloatN &a, const FloatN &b)
{
  a = b;
}
34 
// This is emulating the texture normalized return: every short component is
// passed through s2f() before being stored in the float vector.
__device__ inline void copyFloatN(float2 &a, const short2 &b)
{
  const float fx = s2f(b.x);
  const float fy = s2f(b.y);
  a = make_float2(fx, fy);
}
__device__ inline void copyFloatN(float4 &a, const short4 &b)
{
  const float fx = s2f(b.x);
  const float fy = s2f(b.y);
  const float fz = s2f(b.z);
  const float fw = s2f(b.w);
  a = make_float4(fx, fy, fz, fw);
}
38 
// Precision changes between float and double vectors of equal length.
// Each component is converted individually; double -> float narrows.
__device__ inline void copyFloatN(float2 &a, const double2 &b)
{
  a = make_float2(static_cast<float>(b.x), static_cast<float>(b.y));
}
__device__ inline void copyFloatN(double2 &a, const float2 &b)
{
  a = make_double2(static_cast<double>(b.x), static_cast<double>(b.y));
}
__device__ inline void copyFloatN(float4 &a, const double4 &b)
{
  a = make_float4(static_cast<float>(b.x), static_cast<float>(b.y),
                  static_cast<float>(b.z), static_cast<float>(b.w));
}
__device__ inline void copyFloatN(double4 &a, const float4 &b)
{
  a = make_double4(static_cast<double>(b.x), static_cast<double>(b.y),
                   static_cast<double>(b.z), static_cast<double>(b.w));
}
43 
/* Here we assume that the input data has already been normalized and shifted.
   Each floating-point component is converted straight to short (standard
   floating-to-integer conversion); no scaling is applied in these overloads. */
__device__ inline void copyFloatN(short2 &a, const float2 &b)
{
  a = make_short2(static_cast<short>(b.x), static_cast<short>(b.y));
}
__device__ inline void copyFloatN(short4 &a, const float4 &b)
{
  a = make_short4(static_cast<short>(b.x), static_cast<short>(b.y),
                  static_cast<short>(b.z), static_cast<short>(b.w));
}
__device__ inline void copyFloatN(short2 &a, const double2 &b)
{
  a = make_short2(static_cast<short>(b.x), static_cast<short>(b.y));
}
__device__ inline void copyFloatN(short4 &a, const double4 &b)
{
  a = make_short4(static_cast<short>(b.x), static_cast<short>(b.y),
                  static_cast<short>(b.z), static_cast<short>(b.w));
}
49 
50 
/**
 * Generic array conversion: copies y[i] into x[i] via copyFloatN for
 * i = 0 .. N-1. Specializations below handle cases where the input and
 * output vector lengths differ.
 *
 * @param x  output array (N elements written)
 * @param y  input array (N elements read)
 * @param N  number of elements to convert
 */
template<typename OutputType, typename InputType>
__device__ inline void convert(OutputType x[], InputType y[], const int N)
{
  // default is one-2-one conversion, e.g., matching vector lengths and precisions
#pragma unroll
  for (int idx = 0; idx < N; ++idx) {
    copyFloatN(x[idx], y[idx]);
  }
}
68 
/**
 * short2 -> float2 for N elements. The raw short values are converted to
 * float as-is; unlike copyFloatN(float2&, const short2&), no s2f() scaling
 * is applied here.
 */
template<> __device__ inline void convert<float2,short2>(float2 x[], short2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const short2 &src = y[k];
    x[k] = make_float2(static_cast<float>(src.x), static_cast<float>(src.y));
  }
}
73 
/**
 * short4 -> float4 for N elements. The raw short values are converted to
 * float as-is; unlike copyFloatN(float4&, const short4&), no s2f() scaling
 * is applied here.
 */
template<> __device__ inline void convert<float4,short4>(float4 x[], short4 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const short4 &src = y[k];
    x[k] = make_float4(static_cast<float>(src.x), static_cast<float>(src.y),
                       static_cast<float>(src.z), static_cast<float>(src.w));
  }
}
78 
79 // 4 <-> 2 vector conversion
80 
/**
 * Pack consecutive pairs of double2 into double4:
 * x[k] = (y[2k].x, y[2k].y, y[2k+1].x, y[2k+1].y).
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<double4,double2>(double4 x[], double2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const double2 &lo = y[2*k];
    const double2 &hi = y[2*k+1];
    x[k] = make_double4(lo.x, lo.y, hi.x, hi.y);
  }
}
85 
/**
 * Split each double4 into two double2: (x,y) goes to x[2k], (z,w) to x[2k+1].
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<double2,double4>(double2 x[], double4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const double4 &src = y[k];
    x[2*k]   = make_double2(src.x, src.y);
    x[2*k+1] = make_double2(src.z, src.w);
  }
}
93 
/**
 * Pack consecutive pairs of float2 into float4:
 * x[k] = (y[2k].x, y[2k].y, y[2k+1].x, y[2k+1].y).
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<float4,float2>(float4 x[], float2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const float2 &lo = y[2*k];
    const float2 &hi = y[2*k+1];
    x[k] = make_float4(lo.x, lo.y, hi.x, hi.y);
  }
}
98 
/**
 * Split each float4 into two float2: (x,y) goes to x[2k], (z,w) to x[2k+1].
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<float2,float4>(float2 x[], float4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const float4 &src = y[k];
    x[2*k]   = make_float2(src.x, src.y);
    x[2*k+1] = make_float2(src.z, src.w);
  }
}
106 
/**
 * Pack consecutive pairs of float2 into short4, converting each component
 * directly to short (no scaling is applied here).
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<short4,float2>(short4 x[], float2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const float2 &lo = y[2*k];
    const float2 &hi = y[2*k+1];
    x[k] = make_short4(static_cast<short>(lo.x), static_cast<short>(lo.y),
                       static_cast<short>(hi.x), static_cast<short>(hi.y));
  }
}
111 
/**
 * Split each short4 into two float2: (x,y) goes to x[2k], (z,w) to x[2k+1].
 * Short values are converted to float as-is (no s2f() scaling here).
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<float2,short4>(float2 x[], short4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const short4 &src = y[k];
    x[2*k]   = make_float2(static_cast<float>(src.x), static_cast<float>(src.y));
    x[2*k+1] = make_float2(static_cast<float>(src.z), static_cast<float>(src.w));
  }
}
119 
/**
 * Pack consecutive pairs of short2 into float4. Short values are converted
 * to float as-is (no s2f() scaling here).
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<float4,short2>(float4 x[], short2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const short2 &lo = y[2*k];
    const short2 &hi = y[2*k+1];
    x[k] = make_float4(static_cast<float>(lo.x), static_cast<float>(lo.y),
                       static_cast<float>(hi.x), static_cast<float>(hi.y));
  }
}
124 
/**
 * Split each float4 into two short2: (x,y) goes to x[2k], (z,w) to x[2k+1].
 * Components are converted directly to short (no scaling is applied here).
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<short2,float4>(short2 x[], float4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const float4 &src = y[k];
    x[2*k]   = make_short2(static_cast<short>(src.x), static_cast<short>(src.y));
    x[2*k+1] = make_short2(static_cast<short>(src.z), static_cast<short>(src.w));
  }
}
132 
/**
 * Pack consecutive pairs of double2 into short4, converting each component
 * directly to short (no scaling is applied here).
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<short4,double2>(short4 x[], double2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const double2 &lo = y[2*k];
    const double2 &hi = y[2*k+1];
    x[k] = make_short4(static_cast<short>(lo.x), static_cast<short>(lo.y),
                       static_cast<short>(hi.x), static_cast<short>(hi.y));
  }
}
137 
/**
 * Split each short4 into two double2: (x,y) goes to x[2k], (z,w) to x[2k+1].
 * Short values are converted to double as-is (no scaling here).
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<double2,short4>(double2 x[], short4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const short4 &src = y[k];
    x[2*k]   = make_double2(static_cast<double>(src.x), static_cast<double>(src.y));
    x[2*k+1] = make_double2(static_cast<double>(src.z), static_cast<double>(src.w));
  }
}
145 
/**
 * Pack consecutive pairs of short2 into double4. Short values are converted
 * to double as-is (no scaling here).
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<double4,short2>(double4 x[], short2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const short2 &lo = y[2*k];
    const short2 &hi = y[2*k+1];
    x[k] = make_double4(static_cast<double>(lo.x), static_cast<double>(lo.y),
                        static_cast<double>(hi.x), static_cast<double>(hi.y));
  }
}
150 
/**
 * Split each double4 into two short2: (x,y) goes to x[2k], (z,w) to x[2k+1].
 * Components are converted directly to short (no scaling is applied here).
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<short2,double4>(short2 x[], double4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const double4 &src = y[k];
    x[2*k]   = make_short2(static_cast<short>(src.x), static_cast<short>(src.y));
    x[2*k+1] = make_short2(static_cast<short>(src.z), static_cast<short>(src.w));
  }
}
158 
/**
 * Pack consecutive pairs of double2 into float4, narrowing each component
 * from double to float.
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<float4,double2>(float4 x[], double2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const double2 &lo = y[2*k];
    const double2 &hi = y[2*k+1];
    x[k] = make_float4(static_cast<float>(lo.x), static_cast<float>(lo.y),
                       static_cast<float>(hi.x), static_cast<float>(hi.y));
  }
}
163 
/**
 * Split each float4 into two double2: (x,y) goes to x[2k], (z,w) to x[2k+1],
 * widening each component from float to double.
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<double2,float4>(double2 x[], float4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const float4 &src = y[k];
    x[2*k]   = make_double2(static_cast<double>(src.x), static_cast<double>(src.y));
    x[2*k+1] = make_double2(static_cast<double>(src.z), static_cast<double>(src.w));
  }
}
171 
/**
 * Pack consecutive pairs of float2 into double4, widening each component
 * from float to double.
 * Writes N elements of x and reads 2*N elements of y.
 */
template<> __device__ inline void convert<double4,float2>(double4 x[], float2 y[], const int N) {
#pragma unroll
  for (int k = 0; k < N; ++k) {
    const float2 &lo = y[2*k];
    const float2 &hi = y[2*k+1];
    x[k] = make_double4(static_cast<double>(lo.x), static_cast<double>(lo.y),
                        static_cast<double>(hi.x), static_cast<double>(hi.y));
  }
}
176 
/**
 * Split each double4 into two float2: (x,y) goes to x[2k], (z,w) to x[2k+1],
 * narrowing each component from double to float.
 * The loop runs N/2 times (integer division), reading N/2 elements of y and
 * writing 2*(N/2) elements of x; for odd N the element x[N-1] is not written.
 */
template<> __device__ inline void convert<float2,double4>(float2 x[], double4 y[], const int N) {
  const int quads = N / 2;
#pragma unroll
  for (int k = 0; k < quads; ++k) {
    const double4 &src = y[k];
    x[2*k]   = make_float2(static_cast<float>(src.x), static_cast<float>(src.y));
    x[2*k+1] = make_float2(static_cast<float>(src.z), static_cast<float>(src.w));
  }
}
184 
185 #endif // _CONVERT_H