QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
texture.h
Go to the documentation of this file.
1 #ifndef _TEXTURE_H
2 #define _TEXTURE_H
3 
4 #include <quda_internal.h>
5 #include <color_spinor_field.h>
6 #include <convert.h>
7 
8 //namespace quda {
9 
10 #ifdef USE_TEXTURE_OBJECTS
11 
12 template<typename OutputType, typename InputType>
13 class Texture {
14 
15 private:
16 #ifndef DIRECT_ACCESS_BLAS
17  cudaTextureObject_t spinor;
18 #else
19  const InputType *spinor; // used when textures are disabled
20 #endif
21 
22 public:
23  Texture() : spinor(0) { }
24 #ifndef DIRECT_ACCESS_BLAS
25  Texture(const cudaColorSpinorField *x) : spinor(x->Tex()) { }
26 #else
27  Texture(const cudaColorSpinorField *x) : spinor((InputType*)(x->V())) { }
28 #endif
29  Texture(const Texture &tex) : spinor(tex.spinor) { }
30  ~Texture() { }
31 
32  Texture& operator=(const Texture &tex) {
33  if (this != &tex) spinor = tex.spinor;
34  return *this;
35  }
36 
37 #ifndef DIRECT_ACCESS_BLAS
38  __device__ inline OutputType fetch(unsigned int idx)
39  { return tex1Dfetch<OutputType>(spinor, idx); }
40 #else
41  __device__ inline OutputType fetch(unsigned int idx)
42  { OutputType out; copyFloatN(out, spinor[idx]); return out; }
43 #endif
44 
45  __device__ inline OutputType operator[](unsigned int idx) { return fetch(idx); }
46 };
47 
48 #ifndef DIRECT_ACCESS_BLAS
49 __device__ inline double2 fetch_double2(int4 v)
50 { return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); }
51 
52 template<> __device__ inline double2 Texture<double2,double2>::fetch(unsigned int idx)
53 { double2 out; copyFloatN(out, fetch_double2(tex1Dfetch<int4>(spinor, idx))); return out; }
54 
55 template<> __device__ inline float2 Texture<float2,double2>::fetch(unsigned int idx)
56 { float2 out; copyFloatN(out, fetch_double2(tex1Dfetch<int4>(spinor, idx))); return out; }
57 #endif
58 
59 #else
60 
61 // legacy Texture references
62 
63 #if (__COMPUTE_CAPABILITY__ >= 130)
64  __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
65  {
66  int4 v = tex1Dfetch(t,i);
67  return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
68  }
69 #else
70  __inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
71  {
72  // do nothing
73  return make_double2(0.0, 0.0);
74  }
75 #endif
76 
77 #define MAX_TEXELS (1<<27)
78 
79  template<typename OutputType, typename InputType, int tex_id>
80  class Texture {
81  private:
82 #ifdef DIRECT_ACCESS_BLAS
83  const InputType *spinor; // used when textures are disabled
84  size_t bytes;
85 #endif
86  static bool bound;
87  static int count;
88 
89  public:
91 #ifdef DIRECT_ACCESS_BLAS
92  : spinor(0), bytes(0)
93 #endif
94  { count++; }
95 
96  Texture(const cudaColorSpinorField *x)
97 #ifdef DIRECT_ACCESS_BLAS
98  : spinor((const InputType*)x->V()), bytes(x->Bytes())
99 #endif
100  {
101  // only bind if bytes > 0
102  if (x->Bytes()) { bind((const InputType*)x->V(), x->Bytes()); bound = true; }
103  count++;
104  }
105 
106  Texture(const Texture &tex)
107 #ifdef DIRECT_ACCESS_BLAS
108  : spinor(tex.spinor), bytes(tex.bytes)
109 #endif
110  { count++; }
111 
112  ~Texture() { if (bound && !--count) { unbind(); bound = false;} }
113 
114  Texture& operator=(const Texture &tex) {
115 #ifdef DIRECT_ACCESS_BLAS
116  spinor = tex.spinor;
117  bytes = tex.bytes;
118 #endif
119  return *this;
120  }
121 
122  inline void bind(const InputType*, size_t bytes){ /*errorQuda("Texture id is out of range");*/ }
123  inline void unbind() { /*errorQuda("Texture id is out of range");*/ }
124 
125  //default should only be called if a tex_id is out of range
126  __device__ inline OutputType fetch(unsigned int idx) { OutputType x; x.x =0; return x; };
127  __device__ inline OutputType operator[](unsigned int idx) { return fetch(idx); }
128  };
129 
130  template<typename OutputType, typename InputType, int tex_id>
132 
133  template<typename OutputType, typename InputType, int tex_id>
135 
136 #define DECL_TEX(id) \
137  texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_##id; \
138  texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_##id; \
139  texture<float,1> tex_float_##id; \
140  texture<float2,1> tex_float2_##id; \
141  texture<float4,1> tex_float4_##id; \
142  texture<int4,1> tex_double2_##id;
143 
144 
145 #define DEF_BIND_UNBIND(outtype, intype, id) \
146  template<> inline void Texture<outtype,intype,id>::bind(const intype *ptr, size_t bytes) \
147  { cudaBindTexture(0,tex_##intype##_##id, ptr, bytes); } \
148  template<> inline void Texture<outtype,intype,id>::unbind() { cudaUnbindTexture(tex_##intype##_##id); }
149 
150 
151 #define DEF_FETCH_TEX(outtype, intype, id) \
152  template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
153  { return tex1Dfetch(tex_##intype##_##id,idx); }
154 
155 
156 #define DEF_FETCH_DIRECT(outtype, intype, id) \
157  template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
158  { outtype out; copyFloatN(out, spinor[idx]); return out; }
159 
160 
161 #if defined(DIRECT_ACCESS_BLAS)
162 #define DEF_FETCH DEF_FETCH_DIRECT
163 #else
164 #define DEF_FETCH DEF_FETCH_TEX
165 #endif
166 
167 
168 #if defined(DIRECT_ACCESS_BLAS) || defined(FERMI_NO_DBLE_TEX)
169 #define DEF_FETCH_DBLE DEF_FETCH_DIRECT
170 #else
171 #define DEF_FETCH_DBLE(outtype, intype, id) \
172  template<> __device__ inline outtype Texture<outtype,double2,id>::fetch(unsigned int idx) \
173  { outtype out; copyFloatN(out, fetch_double2(tex_double2_##id,idx)); return out; }
174 #endif
175 
176 
177 #define DEF_BIND_UNBIND_FETCH(outtype, intype, id) \
178  DEF_BIND_UNBIND(outtype, intype, id) \
179  DEF_FETCH(outtype, intype, id)
180 
181 
182 #define DEF_ALL(id) \
183  DECL_TEX(id) \
184  DEF_BIND_UNBIND_FETCH(float2, short2, id) \
185  DEF_BIND_UNBIND_FETCH(float4, short4, id) \
186  DEF_BIND_UNBIND_FETCH(float, float, id) \
187  DEF_BIND_UNBIND_FETCH(float2, float2, id) \
188  DEF_BIND_UNBIND_FETCH(float4, float4, id) \
189  DEF_BIND_UNBIND(double2, double2, id) \
190  DEF_BIND_UNBIND(float2, double2, id) \
191  DEF_FETCH_DBLE(double2, double2, id) \
192  DEF_FETCH_DBLE(float2, double2, id)
193 
194 
195  // Declare the textures and define the member functions of the corresponding templated classes.
196  DEF_ALL(0)
197  DEF_ALL(1)
198  DEF_ALL(2)
199  DEF_ALL(3)
200  DEF_ALL(4)
201 
202 #define MAX_TEX_ID 4
203 
204 
205 #undef DECL_TEX
206 #undef DEF_BIND_UNBIND
207 #undef DEF_FETCH_DIRECT
208 #undef DEF_FETCH_TEX
209 #undef DEF_FETCH
210 #undef DEF_FETCH_DBLE
211 #undef DEF_BIND_UNBIND_FETCH
212 #undef DEF_ALL
213 
214 #endif // USE_TEXTURE_OBJECTS
215 
216 
228  template <typename RegType, typename InterType, typename StoreType>
229  void checkTypes() {
230 
231  const size_t reg_size = sizeof(((RegType*)0)->x);
232  const size_t inter_size = sizeof(((InterType*)0)->x);
233  const size_t store_size = sizeof(((StoreType*)0)->x);
234 
235  if (reg_size != inter_size && store_size != 2 && inter_size != 4)
236  errorQuda("Precision of register (%lu) and intermediate (%lu) types must match\n",
237  (unsigned long)reg_size, (unsigned long)inter_size);
238 
239  if (vecLength<InterType>() != vecLength<StoreType>()) {
240  errorQuda("Vector lengths intermediate and register types must match\n");
241  }
242 
243  if (vecLength<RegType>() == 0) errorQuda("Vector type not supported\n");
244  if (vecLength<InterType>() == 0) errorQuda("Vector type not supported\n");
245  if (vecLength<StoreType>() == 0) errorQuda("Vector type not supported\n");
246 
247  }
248 
249  template <typename FloatN, int M>
250  __device__ inline float store_norm(float *norm, FloatN x[M], int i) {
251  float c[M];
252 #pragma unroll
253  for (int j=0; j<M; j++) c[j] = max_fabs(x[j]);
254 #pragma unroll
255  for (int j=1; j<M; j++) c[0] = fmaxf(c[j],c[0]);
256  norm[i] = c[0];
257  return __fdividef(MAX_SHORT, c[0]);
258  }
259 
260  // the number of elements per virtual register
261 #define REG_LENGTH (sizeof(RegType) / sizeof(((RegType*)0)->x))
262 
263 // whether the type is a shortN vector
264 #define IS_SHORT(type) (sizeof( ((type*)0)->x ) == sizeof(short) )
265 
274 template <typename RegType, typename InterType, typename StoreType, int N, int write, int tex_id=-1>
275  class Spinor {
276 
277  private:
278  StoreType *spinor;
279 #ifdef USE_TEXTURE_OBJECTS // texture objects
281 #else
283 #endif
284  float *norm; // direct reads for norm
285  int stride;
286 
287  public:
289  : spinor(0), tex(), norm(0), stride(0) { } // default constructor
290 
291  Spinor(const cudaColorSpinorField &x)
292  : spinor((StoreType*)x.V()), tex(&x), norm((float*)x.Norm()),
293  stride(x.Length()/(N*REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }
294 
295  Spinor(const Spinor &st)
296  : spinor(st.spinor), tex(st.tex), norm(st.norm), stride(st.stride) { }
297 
298  Spinor(StoreType* spinor, float* norm, int stride)
299  : spinor(spinor), norm(norm), stride(stride) { checkTypes<RegType, InterType, StoreType>(); }
300 
301  Spinor& operator=(const Spinor &src) {
302  if (&src != this) {
303  spinor = src.spinor;
304  tex = src.tex;
305  norm = src.norm;
306  stride = src.stride;
307  }
308  return *this;
309  }
310 
311  ~Spinor() { } /* on g80 / gt200 this must not be virtual */
312 
313  __device__ inline void load(RegType x[], const int i) {
314  // load data into registers first using the storage order
315  const int M = (N * sizeof(RegType)) / sizeof(InterType);
316  InterType y[M];
317 
318  // If we are using tex references, then we can only use the predeclared texture ids
319 #ifndef USE_TEXTURE_OBJECTS
320  if (tex_id >= 0 && tex_id <= MAX_TEX_ID) {
321 #endif
322  // half precision types
323  if ( IS_SHORT(StoreType) ) {
324  float xN = norm[i];
325 #pragma unroll
326  for (int j=0; j<M; j++) y[j] = xN*tex[i + j*stride];
327  } else { // other types
328 #pragma unroll
329  for (int j=0; j<M; j++) copyFloatN(y[j], tex[i + j*stride]);
330  }
331 #ifndef USE_TEXTURE_OBJECTS
332  } else { // default load when out of tex_id range
333 
334  if ( IS_SHORT(StoreType) ) {
335  float xN = norm[i];
336 #pragma unroll
337  for (int j=0; j<M; j++) {
338  copyFloatN(y[j], spinor[i + j*stride]);
339  y[j] *= xN;
340  }
341  } else { // other types
342 #pragma unroll
343  for (int j=0; j<M; j++) copyFloatN(y[j],spinor[i + j*stride]);
344  }
345  }
346 #endif
347 
348  // now convert into desired register order
349  convert<RegType, InterType>(x, y, N);
350  }
351 
352  // default store used for simple fields
353  __device__ inline void save(RegType x[], int i) {
354  if (write) {
355  const int M = (N * sizeof(RegType)) / sizeof(InterType);
356  InterType y[M];
357  convert<InterType, RegType>(y, x, M);
358 
359  if ( IS_SHORT(StoreType) ) {
360  float C = store_norm<InterType, M>(norm, y, i);
361 #pragma unroll
362  for (int j=0; j<M; j++) copyFloatN(spinor[i+j*stride], C*y[j]);
363  } else {
364 #pragma unroll
365  for (int j=0; j<M; j++) copyFloatN(spinor[i+j*stride], y[j]);
366  }
367  }
368  }
369 
370  // used to backup the field to the host
371  void save(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
372  if (write) {
373  *spinor_h = new char[bytes];
374  cudaMemcpy(*spinor_h, spinor, bytes, cudaMemcpyDeviceToHost);
375  if (norm_bytes > 0) {
376  *norm_h = new char[norm_bytes];
377  cudaMemcpy(*norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
378  }
379  checkCudaError();
380  }
381  }
382 
383  // restore the field from the host
384  void load(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
385  if (write) {
386  cudaMemcpy(spinor, *spinor_h, bytes, cudaMemcpyHostToDevice);
387  if (norm_bytes > 0) {
388  cudaMemcpy(norm, *norm_h, norm_bytes, cudaMemcpyHostToDevice);
389  delete []*norm_h;
390  *norm_h = 0;
391  }
392  delete []*spinor_h;
393  *spinor_h = 0;
394  checkCudaError();
395  }
396  }
397 
398  void* V() { return (void*)spinor; }
399  float* Norm() { return norm; }
400 
403  if (sizeof(((StoreType*)0)->x) == sizeof(double)) precision = QUDA_DOUBLE_PRECISION;
404  else if (sizeof(((StoreType*)0)->x) == sizeof(float)) precision = QUDA_SINGLE_PRECISION;
405  else if (sizeof(((StoreType*)0)->x) == sizeof(short)) precision = QUDA_HALF_PRECISION;
406  else errorQuda("Unknown precision type\n");
407  return precision;
408  }
409 
410  int Stride() { return stride; }
411  };
412 
413 //} // namespace quda
414 
415 #ifndef USE_TEXTURE_OBJECTS
416 #undef MAX_TEX_ID
417 #endif
418 
419 #endif // _TEXTURE_H