QUDA v0.7.0: a library for QCD on GPUs

texture.h
#ifndef _TEXTURE_H
#define _TEXTURE_H

#include <quda_internal.h>
#include <color_spinor_field.h>
#include <convert.h>
#include <register_traits.h>
#include <float_vector.h>

//namespace quda {

#ifdef USE_TEXTURE_OBJECTS

template<typename OutputType, typename InputType>
class Texture {

  typedef typename quda::mapper<InputType>::type RegType;

 private:
#ifndef DIRECT_ACCESS_BLAS
  cudaTextureObject_t spinor;
#else
  const InputType *spinor; // used when textures are disabled
#endif

 public:
  Texture() : spinor(0) { }
#ifndef DIRECT_ACCESS_BLAS
  Texture(const cudaColorSpinorField *x) : spinor(x->Tex()) { }
#else
  Texture(const cudaColorSpinorField *x) : spinor((InputType*)(x->V())) { }
#endif
  Texture(const Texture &tex) : spinor(tex.spinor) { }
  ~Texture() { }

  Texture& operator=(const Texture &tex) {
    if (this != &tex) spinor = tex.spinor;
    return *this;
  }

#ifndef DIRECT_ACCESS_BLAS
  __device__ inline OutputType fetch(unsigned int idx)
  {
    OutputType rtn;
    copyFloatN(rtn, tex1Dfetch<RegType>(spinor, idx));
    return rtn;
  }
#else
  __device__ inline OutputType fetch(unsigned int idx)
  { OutputType out; copyFloatN(out, spinor[idx]); return out; }
#endif

  __device__ inline OutputType operator[](unsigned int idx) { return fetch(idx); }
};

#ifndef DIRECT_ACCESS_BLAS
__device__ inline double fetch_double(int2 v)
{ return __hiloint2double(v.y, v.x); }

__device__ inline double2 fetch_double2(int4 v)
{ return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); }

template<> __device__ inline double2 Texture<double2,double2>::fetch(unsigned int idx)
{ double2 out; copyFloatN(out, fetch_double2(tex1Dfetch<int4>(spinor, idx))); return out; }

template<> __device__ inline float2 Texture<float2,double2>::fetch(unsigned int idx)
{ float2 out; copyFloatN(out, fetch_double2(tex1Dfetch<int4>(spinor, idx))); return out; }
#endif

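// Illustrative usage sketch (not part of the library; the field name `x`
// is hypothetical): a kernel argument holding a Texture reads the i-th
// double2 element of a field via operator[], which routes through
// tex1Dfetch and fetch_double2 above.
//
//   Texture<double2, double2> xTex(&x);   // x is a cudaColorSpinorField*
//   __global__ void kernel(Texture<double2, double2> t) {
//     unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
//     double2 v = t[i];                    // texture-object fetch
//   }
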
#else

// legacy Texture references

#if (__COMPUTE_CAPABILITY__ >= 130)

__inline__ __device__ double fetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t,i);
  return __hiloint2double(v.y, v.x);
}

__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t,i);
  return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#else
// double precision is not supported before compute capability 1.3,
// so these stubs simply return zero
__inline__ __device__ double fetch_double(texture<int2, 1> t, int i) { return 0.0; }

__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{ return make_double2(0.0, 0.0); }
#endif
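
// Note on the int2/int4 reinterpretation above: texture hardware cannot
// return 64-bit words, so a double field is bound as pairs of 32-bit ints
// and reassembled with __hiloint2double(hi, lo). Fetching element i of a
// double2 field reads an int4 {lo0, hi0, lo1, hi1} and rebuilds the two
// doubles from (v.y, v.x) and (v.w, v.z).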

#define MAX_TEXELS (1<<27)

#define MAX_TEX_ID 4

// dynamically keep track of texture references we've already bound to
bool tex_id_table[MAX_TEX_ID+1] = { }; // ids run from 0 to MAX_TEX_ID inclusive

template<typename OutputType, typename InputType, int tex_id>
class Texture {

 private:
#ifdef DIRECT_ACCESS_BLAS
  const InputType *spinor; // used when textures are disabled
  size_t bytes;
#endif
  static bool bound;
  static int count;

 public:
  Texture()
#ifdef DIRECT_ACCESS_BLAS
    : spinor(0), bytes(0)
#endif
  { count++; }

  Texture(const cudaColorSpinorField *x)
#ifdef DIRECT_ACCESS_BLAS
    : spinor((const InputType*)x->V()), bytes(x->Bytes())
#endif
  {
    // only bind if bytes > 0
    if (x->Bytes()) {
      if (tex_id > 0 && tex_id <= MAX_TEX_ID && tex_id_table[tex_id]) {
        errorQuda("Already bound to this texture reference");
      } else {
        tex_id_table[tex_id] = true;
      }
      bind((const InputType*)x->V(), x->Bytes()); bound = true;
    }
    count++;
  }

  Texture(const Texture &tex)
#ifdef DIRECT_ACCESS_BLAS
    : spinor(tex.spinor), bytes(tex.bytes)
#endif
  { count++; }

  ~Texture() {
    if (bound && !--count) {
      unbind(); bound = false; tex_id_table[tex_id] = false;
    }
  }

  Texture& operator=(const Texture &tex) {
#ifdef DIRECT_ACCESS_BLAS
    spinor = tex.spinor;
    bytes = tex.bytes;
#endif
    return *this;
  }

  inline void bind(const InputType*, size_t bytes) { /*errorQuda("Texture id is out of range");*/ }
  inline void unbind() { /*errorQuda("Texture id is out of range");*/ }

  // these defaults should only be reached when tex_id is out of range
  __device__ inline OutputType fetch(unsigned int idx) { OutputType x; x.x = 0; return x; }
  __device__ inline OutputType operator[](unsigned int idx) { return fetch(idx); }
};

template<typename OutputType, typename InputType, int tex_id>
bool Texture<OutputType, InputType, tex_id>::bound = false;

template<typename OutputType, typename InputType, int tex_id>
int Texture<OutputType, InputType, tex_id>::count = 0;

#define DECL_TEX(id) \
  texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_##id; \
  texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_##id; \
  texture<float,1> tex_float_##id; \
  texture<float2,1> tex_float2_##id; \
  texture<float4,1> tex_float4_##id; \
  texture<int4,1> tex_double2_##id;


#define DEF_BIND_UNBIND(outtype, intype, id) \
  template<> inline void Texture<outtype,intype,id>::bind(const intype *ptr, size_t bytes) \
  { cudaBindTexture(0,tex_##intype##_##id, ptr, bytes); } \
  template<> inline void Texture<outtype,intype,id>::unbind() { cudaUnbindTexture(tex_##intype##_##id); }


#define DEF_FETCH_TEX(outtype, intype, id) \
  template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
  { return tex1Dfetch(tex_##intype##_##id,idx); }


#define DEF_FETCH_DIRECT(outtype, intype, id) \
  template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
  { outtype out; copyFloatN(out, spinor[idx]); return out; }


#if defined(DIRECT_ACCESS_BLAS)
#define DEF_FETCH DEF_FETCH_DIRECT
#else
#define DEF_FETCH DEF_FETCH_TEX
#endif


#if defined(DIRECT_ACCESS_BLAS) || defined(FERMI_NO_DBLE_TEX)
#define DEF_FETCH_DBLE DEF_FETCH_DIRECT
#else
#define DEF_FETCH_DBLE(outtype, intype, id) \
  template<> __device__ inline outtype Texture<outtype,double2,id>::fetch(unsigned int idx) \
  { outtype out; copyFloatN(out, fetch_double2(tex_double2_##id,idx)); return out; }
#endif

#if defined(DIRECT_ACCESS_BLAS) || defined(FERMI_NO_DBLE_TEX)
#define DEF_FETCH_DBLE_MIXED DEF_FETCH_DIRECT
#else
#define DEF_FETCH_DBLE_MIXED(outtype, intype, id) \
  template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
  { outtype out; copyFloatN(out, tex1Dfetch(tex_##intype##_##id,idx)); return out; }
#endif


#define DEF_BIND_UNBIND_FETCH(outtype, intype, id) \
  DEF_BIND_UNBIND(outtype, intype, id) \
  DEF_FETCH(outtype, intype, id)


#define DEF_ALL(id) \
  DECL_TEX(id) \
  DEF_BIND_UNBIND_FETCH(float2, short2, id) \
  DEF_BIND_UNBIND_FETCH(float4, short4, id) \
  DEF_BIND_UNBIND_FETCH(float, float, id) \
  DEF_BIND_UNBIND_FETCH(float2, float2, id) \
  DEF_BIND_UNBIND_FETCH(float4, float4, id) \
  DEF_BIND_UNBIND(double2, double2, id) \
  DEF_BIND_UNBIND(float2, double2, id) \
  DEF_FETCH_DBLE(double2, double2, id) \
  DEF_FETCH_DBLE(float2, double2, id) \
  DEF_BIND_UNBIND(double2, float2, id) \
  DEF_BIND_UNBIND(double4, float4, id) \
  DEF_BIND_UNBIND(double2, short2, id) \
  DEF_BIND_UNBIND(double4, short4, id) \
  DEF_FETCH_DBLE_MIXED(double2, float2, id) \
  DEF_FETCH_DBLE_MIXED(double4, float4, id) \
  DEF_FETCH_DBLE_MIXED(double2, short2, id) \
  DEF_FETCH_DBLE_MIXED(double4, short4, id)

// Declare the textures and define the member functions of the corresponding templated classes.
DEF_ALL(0)
DEF_ALL(1)
DEF_ALL(2)
DEF_ALL(3)
DEF_ALL(4)

#undef DECL_TEX
#undef DEF_BIND_UNBIND
#undef DEF_FETCH_DIRECT
#undef DEF_FETCH_TEX
#undef DEF_FETCH
#undef DEF_FETCH_DBLE
#undef DEF_FETCH_DBLE_MIXED
#undef DEF_BIND_UNBIND_FETCH
#undef DEF_ALL

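// For reference, DEF_ALL(0) expands (in part) to declarations and
// specializations like the following for the float2/float2 pairing,
// assuming the texture (non-direct-access) path:
//
//   texture<float2,1> tex_float2_0;
//
//   template<> inline void Texture<float2,float2,0>::bind(const float2 *ptr, size_t bytes)
//   { cudaBindTexture(0, tex_float2_0, ptr, bytes); }
//   template<> inline void Texture<float2,float2,0>::unbind() { cudaUnbindTexture(tex_float2_0); }
//
//   template<> __device__ inline float2 Texture<float2,float2,0>::fetch(unsigned int idx)
//   { return tex1Dfetch(tex_float2_0, idx); }
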
#endif // USE_TEXTURE_OBJECTS

template <typename RegType, typename InterType, typename StoreType>
void checkTypes() {

  const size_t reg_size = sizeof(((RegType*)0)->x);
  const size_t inter_size = sizeof(((InterType*)0)->x);
  const size_t store_size = sizeof(((StoreType*)0)->x);

  // register and intermediate precisions must match, except when promoting
  // from half (short storage) or from single-precision intermediates
  if (reg_size != inter_size && store_size != 2 && inter_size != 4)
    errorQuda("Precision of register (%lu) and intermediate (%lu) types must match\n",
              (unsigned long)reg_size, (unsigned long)inter_size);

  if (vecLength<InterType>() != vecLength<StoreType>()) {
    errorQuda("Vector lengths of intermediate and store types must match\n");
  }

  if (vecLength<RegType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<InterType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<StoreType>() == 0) errorQuda("Vector type not supported\n");
}
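
// Example: checkTypes<float4, float4, short4>() passes -- register and
// intermediate precisions match and all three vector lengths are 4 --
// while checkTypes<float2, float2, short4>() aborts because the
// intermediate and store vector lengths differ.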

template <typename FloatN, int M>
__device__ inline float store_norm(float *norm, FloatN x[M], int i) {
  float c[M];
#pragma unroll
  for (int j=0; j<M; j++) c[j] = max_fabs(x[j]);
#pragma unroll
  for (int j=1; j<M; j++) c[0] = fmaxf(c[j], c[0]);
  norm[i] = c[0];
  return __fdividef(MAX_SHORT, c[0]);
}
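
// Worked example of the half-precision scaling: if the largest magnitude
// across the M chunks of site i is c = 0.5, then norm[i] = 0.5 and the
// returned scale is MAX_SHORT/0.5, so each element x is stored as roughly
// x*MAX_SHORT/c. The normalized texture reads above later return
// stored/MAX_SHORT, which times norm[i] recovers x.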

// the number of elements per virtual register
#define REG_LENGTH (sizeof(RegType) / sizeof(((RegType*)0)->x))

// whether the type is a shortN vector
#define IS_SHORT(type) (sizeof( ((type*)0)->x ) == sizeof(short) )

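// E.g. REG_LENGTH evaluates to 2 when RegType is double2 and to 4 when it
// is float4; IS_SHORT(short4) is true while IS_SHORT(float4) is false.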

template <typename RegType, typename InterType, typename StoreType, int N, int write, int tex_id=-1>
class Spinor {

 private:
  StoreType *spinor;
#ifdef USE_TEXTURE_OBJECTS // texture objects
  Texture<InterType, StoreType> tex;
#else
  Texture<InterType, StoreType, tex_id> tex;
#endif
  float *norm; // direct reads for norm
  int stride;

 public:
  Spinor()
    : spinor(0), tex(), norm(0), stride(0) { } // default constructor

  Spinor(const cudaColorSpinorField &x)
    : spinor((StoreType*)x.V()), tex(&x), norm((float*)x.Norm()),
      stride(x.Length()/(N*REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }

  Spinor(const Spinor &st)
    : spinor(st.spinor), tex(st.tex), norm(st.norm), stride(st.stride) { }

  Spinor(StoreType* spinor, float* norm, int stride)
    : spinor(spinor), norm(norm), stride(stride) { checkTypes<RegType, InterType, StoreType>(); }

  Spinor& operator=(const Spinor &src) {
    if (&src != this) {
      spinor = src.spinor;
      tex = src.tex;
      norm = src.norm;
      stride = src.stride;
    }
    return *this;
  }

  void set(const cudaColorSpinorField &x) {
    spinor = (StoreType*)x.V();
#ifdef USE_TEXTURE_OBJECTS
    tex = Texture<InterType, StoreType>(&x);
#else
    tex = Texture<InterType, StoreType, tex_id>(&x);
#endif
    norm = (float*)x.Norm();
    stride = x.Length()/(N*REG_LENGTH);

    checkTypes<RegType,InterType,StoreType>();
  }

  ~Spinor() { } /* on g80 / gt200 this must not be virtual */

  __device__ inline void load(RegType x[], const int i) {
    // load data into registers first, using the storage order
    const int M = (N * sizeof(RegType)) / sizeof(InterType);
    InterType y[M];

    // if we are using texture references, we can only use the predeclared texture ids
#ifndef USE_TEXTURE_OBJECTS
    if (tex_id >= 0 && tex_id <= MAX_TEX_ID) {
#endif
      // half-precision types
      if ( IS_SHORT(StoreType) ) {
        float xN = norm[i];
#pragma unroll
        for (int j=0; j<M; j++) y[j] = xN*tex[i + j*stride];
      } else { // other types
#pragma unroll
        for (int j=0; j<M; j++) copyFloatN(y[j], tex[i + j*stride]);
      }
#ifndef USE_TEXTURE_OBJECTS
    } else { // default load when out of tex_id range

      if ( IS_SHORT(StoreType) ) {
        float xN = norm[i];
#pragma unroll
        for (int j=0; j<M; j++) {
          copyFloatN(y[j], spinor[i + j*stride]);
          y[j] *= xN;
        }
      } else { // other types
#pragma unroll
        for (int j=0; j<M; j++) copyFloatN(y[j], spinor[i + j*stride]);
      }
    }
#endif

    // now convert into the desired register order
    convert<RegType, InterType>(x, y, N);
  }
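
  // Layout note: the M intermediate-type chunks belonging to site i are
  // strided, living at offsets i + j*stride for j = 0..M-1. For example,
  // with RegType = float4, InterType = float4, StoreType = short4 and
  // N = 6, M = (6 * 16) / 16 = 6 short4 chunks are fetched through the
  // normalized texture and rescaled by norm[i].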

  // default store used for simple fields
  __device__ inline void save(RegType x[], int i) {
    if (write) {
      const int M = (N * sizeof(RegType)) / sizeof(InterType);
      InterType y[M];
      convert<InterType, RegType>(y, x, M);

      if ( IS_SHORT(StoreType) ) {
        float C = store_norm<InterType, M>(norm, y, i);
#pragma unroll
        for (int j=0; j<M; j++) copyFloatN(spinor[i+j*stride], C*y[j]);
      } else {
#pragma unroll
        for (int j=0; j<M; j++) copyFloatN(spinor[i+j*stride], y[j]);
      }
    }
  }

  // used to back up the field to the host
  void save(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
    if (write) {
      *spinor_h = new char[bytes];
      cudaMemcpy(*spinor_h, spinor, bytes, cudaMemcpyDeviceToHost);
      if (norm_bytes > 0) {
        *norm_h = new char[norm_bytes];
        cudaMemcpy(*norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
      }
      checkCudaError();
    }
  }

  // restore the field from the host
  void load(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
    if (write) {
      cudaMemcpy(spinor, *spinor_h, bytes, cudaMemcpyHostToDevice);
      if (norm_bytes > 0) {
        cudaMemcpy(norm, *norm_h, norm_bytes, cudaMemcpyHostToDevice);
        delete []*norm_h;
        *norm_h = 0;
      }
      delete []*spinor_h;
      *spinor_h = 0;
      checkCudaError();
    }
  }

  void* V() { return (void*)spinor; }
  float* Norm() { return norm; }

  QudaPrecision Precision() {
    QudaPrecision precision = QUDA_INVALID_PRECISION;
    if (sizeof(((StoreType*)0)->x) == sizeof(double)) precision = QUDA_DOUBLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(float)) precision = QUDA_SINGLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(short)) precision = QUDA_HALF_PRECISION;
    else errorQuda("Unknown precision type\n");
    return precision;
  }

  int Stride() const { return stride; }

  void setStride(int stride_) { stride = stride_; }
};

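// Illustrative usage sketch (hypothetical kernel, not part of this file):
// a blas-style kernel reads a site into registers through a Spinor,
// operates on it, and writes it back through the same Spinor.
//
//   Spinor<float4, float4, float4, 6, 1> X(x);   // x: cudaColorSpinorField
//   __global__ void scaleKernel(Spinor<float4, float4, float4, 6, 1> X, float a, int length) {
//     int i = blockIdx.x*blockDim.x + threadIdx.x;
//     if (i >= length) return;
//     float4 r[6];
//     X.load(r, i);   // texture or direct load into registers
//     for (int j=0; j<6; j++) { r[j].x *= a; r[j].y *= a; r[j].z *= a; r[j].w *= a; }
//     X.save(r, i);   // stores since the write template parameter is 1
//   }
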
//} // namespace quda

#ifndef USE_TEXTURE_OBJECTS
#undef MAX_TEX_ID
#endif

#endif // _TEXTURE_H