QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hw_quda.cpp
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 
4 #include "quda.h"
5 #include "hw_quda.h"
6 #include "util_quda.h"
7 
8 #define hwSiteSize 12
9 
10 static ParityHw allocateParityHw(int *X, QudaPrecision precision)
11 {
12  ParityHw ret;
13 
14  ret.precision = precision;
15  ret.X[0] = X[0]/2;
16  ret.volume = X[0]/2;
17  for (int d=1; d<4; d++) {
18  ret.X[d] = X[d];
19  ret.volume *= X[d];
20  }
21  ret.Nc = 3;
22  ret.Ns = 2;
23  ret.length = ret.volume*ret.Nc*ret.Ns*2;
24 
25  if (precision == QUDA_DOUBLE_PRECISION) ret.bytes = ret.length*sizeof(double);
26  else if (precision == QUDA_SINGLE_PRECISION) ret.bytes = ret.length*sizeof(float);
27  else ret.bytes = ret.length*sizeof(short);
28 
29  ret.data = device_malloc(ret.bytes);
30  cudaMemset(ret.data, 0, ret.bytes);
31 
32  if (precision == QUDA_HALF_PRECISION) {
33  errorQuda("Half precision not supported at present"); //FIXME
34  //ret.dataNorm = device_malloc(2*ret.bytes/spinorSiteSize);
35  }
36  return ret;
37 }
38 
39 
40 FullHw
41 createHwQuda(int *X, QudaPrecision precision)
42 {
43  FullHw ret;
44  ret.even = allocateParityHw(X, precision);
45  ret.odd = allocateParityHw(X, precision);
46  return ret;
47 }
48 
49 
50 static void freeParityHwQuda(ParityHw parity_hw)
51 {
52  device_free(parity_hw.data);
53  if (parity_hw.precision == QUDA_HALF_PRECISION){
54  device_free(parity_hw.dataNorm);
55  }
56  parity_hw.data = NULL;
57  parity_hw.dataNorm = NULL;
58 }
59 
60 
61 void freeHwQuda(FullHw hw)
62 {
63  freeParityHwQuda(hw.even);
64  freeParityHwQuda(hw.odd);
65 }
66 
67 
68 template <typename Float>
69 static inline void packHwVector(float4* a, Float *b, int Vh)
70 {
71  a[0*Vh].x = b[0];
72  a[0*Vh].y = b[1];
73  a[0*Vh].z = b[2];
74  a[0*Vh].w = b[3];
75 
76  a[1*Vh].x = b[4];
77  a[1*Vh].y = b[5];
78  a[1*Vh].z = b[6];
79  a[1*Vh].w = b[7];
80 
81  a[2*Vh].x = b[8];
82  a[2*Vh].y = b[9];
83  a[2*Vh].z = b[10];
84  a[2*Vh].w = b[11];
85 
86 }
87 
88 template <typename Float>
89 static inline void packHwVector(float2* a, Float *b, int Vh)
90 {
91  a[0*Vh].x = b[0];
92  a[0*Vh].y = b[1];
93 
94  a[1*Vh].x = b[2];
95  a[1*Vh].y = b[3];
96 
97  a[2*Vh].x = b[4];
98  a[2*Vh].y = b[5];
99 
100  a[3*Vh].x = b[6];
101  a[3*Vh].y = b[7];
102 
103  a[4*Vh].x = b[8];
104  a[4*Vh].y = b[9];
105 
106  a[5*Vh].x = b[10];
107  a[5*Vh].y = b[11];
108 }
109 
110 template <typename Float>
111 static inline void packHwVector(double2* a, Float *b, int Vh)
112 {
113  a[0*Vh].x = b[0];
114  a[0*Vh].y = b[1];
115 
116  a[1*Vh].x = b[2];
117  a[1*Vh].y = b[3];
118 
119  a[2*Vh].x = b[4];
120  a[2*Vh].y = b[5];
121 
122  a[3*Vh].x = b[6];
123  a[3*Vh].y = b[7];
124 
125  a[4*Vh].x = b[8];
126  a[4*Vh].y = b[9];
127 
128  a[5*Vh].x = b[10];
129  a[5*Vh].y = b[11];
130 }
131 
132 
133 template <typename Float>
134 static inline void unpackHwVector(Float *a, float4 *b, int Vh)
135 {
136  a[0] = a[0*Vh].x;
137  a[1] = a[0*Vh].y;
138  a[2] = a[0*Vh].z;
139  a[3] = a[0*Vh].t;
140 
141  a[4] = a[1*Vh].x;
142  a[5] = a[1*Vh].y;
143  a[6] = a[1*Vh].z;
144  a[7] = a[1*Vh].t;
145 
146  a[8] = a[2*Vh].x;
147  a[9] = a[2*Vh].y;
148  a[10] = a[2*Vh].z;
149  a[11] = a[2*Vh].t;
150 }
151 
152 
153 template <typename Float>
154 static inline void unpackHwVector(Float *a, float2 *b, int Vh)
155 {
156  a[0] = b[0*Vh].x;
157  a[1] = b[0*Vh].y;
158 
159  a[2] = b[1*Vh].x;
160  a[3] = b[1*Vh].y;
161 
162  a[4] = b[2*Vh].x;
163  a[5] = b[2*Vh].y;
164 
165  a[6] = b[3*Vh].x;
166  a[7] = b[3*Vh].y;
167 
168  a[8] = b[4*Vh].x;
169  a[9] = b[4*Vh].y;
170 
171  a[10] = b[5*Vh].x;
172  a[11] = b[5*Vh].y;
173 
174 }
175 
176 template <typename Float>
177 static inline void unpackHwVector(Float *a, double2 *b, int Vh)
178 {
179  a[0] = b[0*Vh].x;
180  a[1] = b[0*Vh].y;
181 
182  a[2] = b[1*Vh].x;
183  a[3] = b[1*Vh].y;
184 
185  a[4] = b[2*Vh].x;
186  a[5] = b[2*Vh].y;
187 
188  a[6] = b[3*Vh].x;
189  a[7] = b[3*Vh].y;
190 
191  a[8] = b[4*Vh].x;
192  a[9] = b[4*Vh].y;
193 
194  a[10] = b[5*Vh].x;
195  a[11] = b[5*Vh].y;
196 
197 }
198 
199 template <typename Float, typename FloatN>
200 void packParityHw(FloatN *res, Float *hw, int Vh)
201 {
202  for (int i = 0; i < Vh; i++) {
203  packHwVector(res+i, hw+hwSiteSize*i, Vh);
204  }
205 }
206 
207 template <typename Float, typename FloatN>
208 static void unpackParityHw(Float *res, FloatN *hwPacked, int Vh) {
209 
210  for (int i = 0; i < Vh; i++) {
211  unpackHwVector(res+i*hwSiteSize, hwPacked+i, Vh);
212  }
213 }
214 
215 
216 
217 void static loadParityHw(ParityHw ret, void *hw, QudaPrecision cpu_prec)
218 {
219  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
220  errorQuda("CUDA double precision requires CPU double precision");
221  }
222 
223  if (ret.precision != QUDA_HALF_PRECISION) {
224 
225  void *packedHw1 = pinned_malloc(ret.bytes);
226 
227  if (ret.precision == QUDA_DOUBLE_PRECISION) {
228  packParityHw((double2*)packedHw1, (double*)hw, ret.volume);
229  } else {
230  if (cpu_prec == QUDA_DOUBLE_PRECISION) {
231  packParityHw((float2*)packedHw1, (double*)hw, ret.volume);
232  } else {
233  packParityHw((float2*)packedHw1, (float*)hw, ret.volume);
234  }
235  }
236  cudaMemcpy(ret.data, packedHw1, ret.bytes, cudaMemcpyHostToDevice);
237  host_free(packedHw1);
238 
239  } else {
240  //half precision
241  /*
242  ParityHw tmp = allocateParityHw(ret.X, QUDA_SINGLE_PRECISION);
243  loadParityHw(tmp, hw, cpu_prec, dirac_order);
244  copyCuda(ret, tmp);
245  freeParityHw(tmp);
246  */
247  }
248 }
249 
250 
251 void loadHwToGPU(FullHw ret, void *hw, QudaPrecision cpu_prec)
252 {
253  void *hw_odd;
254  if (cpu_prec == QUDA_SINGLE_PRECISION){
255  hw_odd = ((float*)hw) + ret.even.length;
256  }else{
257  hw_odd = ((double*)hw) + ret.even.length;
258  }
259 
260  loadParityHw(ret.even, hw, cpu_prec);
261  loadParityHw(ret.odd, hw_odd, cpu_prec);
262 
263 }
264 
265 
266 static void retrieveParityHw(void *res, ParityHw hw, QudaPrecision cpu_prec)
267 {
268  if (hw.precision != QUDA_HALF_PRECISION) {
269 
270  void *packedHw1 = pinned_malloc(hw.bytes);
271  cudaMemcpy(packedHw1, hw.data, hw.bytes, cudaMemcpyDeviceToHost);
272 
273  if (hw.precision == QUDA_DOUBLE_PRECISION) {
274  unpackParityHw((double*)res, (double2*)packedHw1, hw.volume);
275  } else {
276  if (cpu_prec == QUDA_DOUBLE_PRECISION){
277  unpackParityHw((double*)res, (float2*)packedHw1, hw.volume);
278  } else {
279  unpackParityHw((float*)res, (float2*)packedHw1, hw.volume);
280  }
281  }
282  host_free(packedHw1);
283 
284  } else {
285  //half precision
286  /*
287  ParityHw tmp = allocateParityHw(hw.X, QUDA_SINGLE_PRECISION);
288  copyCuda(tmp, hw);
289  retrieveParityHw(res, tmp, cpu_prec, dirac_order);
290  freeParityHw(tmp);
291  */
292  }
293 }
294 
295 
296 void
297 retrieveHwField(void *res, FullHw hw, QudaPrecision cpu_prec)
298 {
299  void *res_odd;
300  if (cpu_prec == QUDA_SINGLE_PRECISION) res_odd = (float*)res + hw.even.length;
301  else res_odd = (double*)res + hw.even.length;
302 
303  retrieveParityHw(res, hw.even, cpu_prec);
304  retrieveParityHw(res_odd, hw.odd, cpu_prec);
305 
306 }
307 
308 
309 /*
310 void hwHalfPack(float *c, short *s0, float *f0, int V) {
311 
312  float *f = f0;
313  short *s = s0;
314  for (int i=0; i<24*V; i+=24) {
315  c[i] = sqrt(f[0]*f[0] + f[1]*f[1]);
316  for (int j=0; j<24; j+=2) {
317  float k = sqrt(f[j]*f[j] + f[j+1]*f[j+1]);
318  if (k > c[i]) c[i] = k;
319  }
320 
321  for (int j=0; j<24; j++) s[j] = (short)(MAX_SHORT*f[j]/c[i]);
322  f+=24;
323  s+=24;
324  }
325 
326 }
327 
328 void hwHalfUnpack(float *f0, float *c, short *s0, int V) {
329  float *f = f0;
330  short *s = s0;
331  for (int i=0; i<24*V; i+=24) {
332  for (int j=0; j<24; j++) f[j] = s[j] * (c[i] / MAX_SHORT);
333  f+=24;
334  s+=24;
335  }
336 
337 }
338 */
QudaPrecision precision
Definition: quda_internal.h:56
__constant__ int Vh
void freeHwQuda(FullHw hw)
Definition: hw_quda.cpp:61
#define pinned_malloc(size)
Definition: malloc_quda.h:26
enum QudaPrecision_s QudaPrecision
#define errorQuda(...)
Definition: util_quda.h:73
void retrieveHwField(void *res, FullHw hw, QudaPrecision cpu_prec)
Definition: hw_quda.cpp:297
#define host_free(ptr)
Definition: malloc_quda.h:29
void loadHwToGPU(FullHw ret, void *hw, QudaPrecision cpu_prec)
Definition: hw_quda.cpp:251
QudaPrecision cpu_prec
Definition: dslash_test.cpp:34
size_t bytes
Definition: quda_internal.h:55
FloatingPoint< float > Float
Definition: gtest.h:7350
ParityHw odd
Definition: quda_internal.h:67
void * data
Definition: quda_internal.h:62
#define hwSiteSize
Definition: hw_quda.cpp:8
void packParityHw(FloatN *res, Float *hw, int Vh)
Definition: hw_quda.cpp:200
Main header file for the QUDA library.
ParityHw even
Definition: quda_internal.h:68
#define device_malloc(size)
Definition: malloc_quda.h:24
float * dataNorm
Definition: quda_internal.h:63
FullHw createHwQuda(int *X, QudaPrecision precision)
Definition: hw_quda.cpp:41
int X[QUDA_MAX_DIM]
Definition: quda_internal.h:59
#define device_free(ptr)
Definition: malloc_quda.h:28