QUDA v0.4.0
A library for QCD on GPUs
quda/include/texture.h
Go to the documentation of this file.
00001 #include <convert.h>
00002 
00003 #pragma once
00004 
#if (__COMPUTE_CAPABILITY__ >= 130)
// Reassemble a double2 from an int4 texel: double-precision texture reads
// are not supported directly, so the data is bound as int4 and the two
// 64-bit values are rebuilt from their 32-bit halves.
__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  int4 raw = tex1Dfetch(t, i);
  double first  = __hiloint2double(raw.y, raw.x);
  double second = __hiloint2double(raw.w, raw.z);
  return make_double2(first, second);
}
#else
// Pre-compute-capability-1.3 devices lack double support: return zeros.
__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  return make_double2(0.0, 0.0);
}
#endif
00018 
// File-scope texture references used by the Texture<> wrapper below.  The
// legacy texture-reference API requires global objects, so five instances
// (one per tex_id in 0..4) are declared for each storage type.

// half precision: short texels are normalized to float on fetch
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_0;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_1;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_2;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_3;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_4;

texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_0;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_1;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_2;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_3;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_4;

// single precision
texture<float,1> tex_float_0;
texture<float,1> tex_float_1;
texture<float,1> tex_float_2;
texture<float,1> tex_float_3;
texture<float,1> tex_float_4;

texture<float2,1> tex_float2_0;
texture<float2,1> tex_float2_1;
texture<float2,1> tex_float2_2;
texture<float2,1> tex_float2_3;
texture<float2,1> tex_float2_4;

texture<float4,1> tex_float4_0;
texture<float4,1> tex_float4_1;
texture<float4,1> tex_float4_2;
texture<float4,1> tex_float4_3;
texture<float4,1> tex_float4_4;

// int4 textures carry double2 data: each 128-bit texel holds two doubles
// that fetch_double2() reassembles with __hiloint2double
texture<int4,1> tex_int4_0;
texture<int4,1> tex_int4_1;
texture<int4,1> tex_int4_2;
texture<int4,1> tex_int4_3;
texture<int4,1> tex_int4_4;

// texel count used as the bind size in Texture's constructor
// (2^27 — presumably the maximum width of a 1D texture bound to
// linear memory; TODO confirm against the hardware limit)
#define MAX_TEXELS (1<<27)
00056 
/**
   Lightweight wrapper that presents a bound texture as an array of
   OutputType.  The (OutputType, InputType, tex_id) triple selects one of
   the file-scope texture references via the bind/unbind/fetch
   specializations below; the generic members here are fallbacks that
   report an out-of-range tex_id.
*/
template<typename OutputType, typename InputType, int tex_id=0>
class Texture {
 private: 
 const InputType *spinor; // used when textures are disabled
 //size_t bytes;

 public:
 // initialize the pointer so a default-constructed object is safe to
 // copy-assign from (previously left indeterminate)
 Texture() : spinor(0) { ; }
 Texture(const InputType *x, size_t bytes) : spinor(x)/*, bytes(bytes)*/ { 
   // NOTE(review): binds a fixed MAX_TEXELS-sized range rather than the
   // actual allocation size (see the commented-out alternative) —
   // presumably deliberate so rebinding is unnecessary when buffer sizes
   // change, but confirm this is intended.
   if (bytes) bind(x, MAX_TEXELS*sizeof(InputType)); // only bind if bytes > 0
   //if (bytes) bind(x, bytes); // only bind if bytes > 0
 }
 ~Texture() { /*if (bytes) */ /*unbind()*/; } // unbinding is unnecessary and costly

 Texture& operator=(const Texture &tex) {
   spinor = tex.spinor;
   return *this;
 }

 // bind/unbind are specialized per (OutputType, InputType, tex_id) below;
 // reaching these generic versions means tex_id has no matching texture
 inline void bind(const InputType*, size_t bytes){ errorQuda("Texture id is out of range"); }
 inline void unbind() { errorQuda("Texture id is out of range"); }

 //default should only be called if a tex_id is out of range
 __device__ inline OutputType fetch(unsigned int idx) { return 0; };  
 __device__ inline OutputType operator[](unsigned int idx) { return fetch(idx); }
};
00084 
// bind() specializations: each (OutputType, InputType, tex_id) combination
// attaches the given device pointer to its dedicated file-scope texture.

// half precision, 2-component
template<> inline void Texture<float2,short2,0>::bind(const short2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short2_0, ptr, bytes); }
template<> inline void Texture<float2,short2,1>::bind(const short2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short2_1, ptr, bytes); }
template<> inline void Texture<float2,short2,2>::bind(const short2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short2_2, ptr, bytes); }
template<> inline void Texture<float2,short2,3>::bind(const short2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short2_3, ptr, bytes); }
template<> inline void Texture<float2,short2,4>::bind(const short2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short2_4, ptr, bytes); }

// half precision, 4-component
template<> inline void Texture<float4,short4,0>::bind(const short4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short4_0, ptr, bytes); }
template<> inline void Texture<float4,short4,1>::bind(const short4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short4_1, ptr, bytes); }
template<> inline void Texture<float4,short4,2>::bind(const short4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short4_2, ptr, bytes); }
template<> inline void Texture<float4,short4,3>::bind(const short4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short4_3, ptr, bytes); }
template<> inline void Texture<float4,short4,4>::bind(const short4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_short4_4, ptr, bytes); }

// single precision, scalar (norm array)
template<> inline void Texture<float,float,0>::bind(const float *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float_0, ptr, bytes); }
template<> inline void Texture<float,float,1>::bind(const float *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float_1, ptr, bytes); }
template<> inline void Texture<float,float,2>::bind(const float *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float_2, ptr, bytes); }
template<> inline void Texture<float,float,3>::bind(const float *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float_3, ptr, bytes); }
template<> inline void Texture<float,float,4>::bind(const float *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float_4, ptr, bytes); }

// single precision, 2-component
template<> inline void Texture<float2,float2,0>::bind(const float2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float2_0, ptr, bytes); }
template<> inline void Texture<float2,float2,1>::bind(const float2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float2_1, ptr, bytes); }
template<> inline void Texture<float2,float2,2>::bind(const float2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float2_2, ptr, bytes); }
template<> inline void Texture<float2,float2,3>::bind(const float2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float2_3, ptr, bytes); }
template<> inline void Texture<float2,float2,4>::bind(const float2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float2_4, ptr, bytes); }

// single precision, 4-component
template<> inline void Texture<float4,float4,0>::bind(const float4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float4_0, ptr, bytes); }
template<> inline void Texture<float4,float4,1>::bind(const float4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float4_1, ptr, bytes); }
template<> inline void Texture<float4,float4,2>::bind(const float4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float4_2, ptr, bytes); }
template<> inline void Texture<float4,float4,3>::bind(const float4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float4_3, ptr, bytes); }
template<> inline void Texture<float4,float4,4>::bind(const float4 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_float4_4, ptr, bytes); }

// double precision: double2 data is bound to the int4 textures so the
// texels can be read back through fetch_double2()
template<> inline void Texture<double2,double2,0>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_0, ptr, bytes); }
template<> inline void Texture<double2,double2,1>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_1, ptr, bytes); }
template<> inline void Texture<double2,double2,2>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_2, ptr, bytes); }
template<> inline void Texture<double2,double2,3>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_3, ptr, bytes); }
template<> inline void Texture<double2,double2,4>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_4, ptr, bytes); }

// mixed precision (float registers, double storage) also uses the int4 textures
template<> inline void Texture<float2,double2,0>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_0, ptr, bytes); }
template<> inline void Texture<float2,double2,1>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_1, ptr, bytes); }
template<> inline void Texture<float2,double2,2>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_2, ptr, bytes); }
template<> inline void Texture<float2,double2,3>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_3, ptr, bytes); }
template<> inline void Texture<float2,double2,4>::bind(const double2 *ptr, size_t bytes) 
{ cudaBindTexture(0,tex_int4_4, ptr, bytes); }
00161 
// unbind() specializations, mirroring the bind() set above.  Note the
// destructor deliberately does not call these (see Texture::~Texture).

template<> inline void Texture<float2,short2,0>::unbind() { cudaUnbindTexture(tex_short2_0); }
template<> inline void Texture<float2,short2,1>::unbind() { cudaUnbindTexture(tex_short2_1); }
template<> inline void Texture<float2,short2,2>::unbind() { cudaUnbindTexture(tex_short2_2); }
template<> inline void Texture<float2,short2,3>::unbind() { cudaUnbindTexture(tex_short2_3); }
template<> inline void Texture<float2,short2,4>::unbind() { cudaUnbindTexture(tex_short2_4); }

template<> inline void Texture<float4,short4,0>::unbind() { cudaUnbindTexture(tex_short4_0); }
template<> inline void Texture<float4,short4,1>::unbind() { cudaUnbindTexture(tex_short4_1); }
template<> inline void Texture<float4,short4,2>::unbind() { cudaUnbindTexture(tex_short4_2); }
template<> inline void Texture<float4,short4,3>::unbind() { cudaUnbindTexture(tex_short4_3); }
template<> inline void Texture<float4,short4,4>::unbind() { cudaUnbindTexture(tex_short4_4); }

template<> inline void Texture<float,float,0>::unbind() { cudaUnbindTexture(tex_float_0); }
template<> inline void Texture<float,float,1>::unbind() { cudaUnbindTexture(tex_float_1); }
template<> inline void Texture<float,float,2>::unbind() { cudaUnbindTexture(tex_float_2); }
template<> inline void Texture<float,float,3>::unbind() { cudaUnbindTexture(tex_float_3); }
template<> inline void Texture<float,float,4>::unbind() { cudaUnbindTexture(tex_float_4); }

template<> inline void Texture<float2,float2,0>::unbind() { cudaUnbindTexture(tex_float2_0); }
template<> inline void Texture<float2,float2,1>::unbind() { cudaUnbindTexture(tex_float2_1); }
template<> inline void Texture<float2,float2,2>::unbind() { cudaUnbindTexture(tex_float2_2); }
template<> inline void Texture<float2,float2,3>::unbind() { cudaUnbindTexture(tex_float2_3); }
template<> inline void Texture<float2,float2,4>::unbind() { cudaUnbindTexture(tex_float2_4); }

template<> inline void Texture<float4,float4,0>::unbind() { cudaUnbindTexture(tex_float4_0); }
template<> inline void Texture<float4,float4,1>::unbind() { cudaUnbindTexture(tex_float4_1); }
template<> inline void Texture<float4,float4,2>::unbind() { cudaUnbindTexture(tex_float4_2); }
template<> inline void Texture<float4,float4,3>::unbind() { cudaUnbindTexture(tex_float4_3); }
template<> inline void Texture<float4,float4,4>::unbind() { cudaUnbindTexture(tex_float4_4); }

// double-precision variants release the shared int4 textures
template<> inline void Texture<double2,double2,0>::unbind() { cudaUnbindTexture(tex_int4_0); }
template<> inline void Texture<double2,double2,1>::unbind() { cudaUnbindTexture(tex_int4_1); }
template<> inline void Texture<double2,double2,2>::unbind() { cudaUnbindTexture(tex_int4_2); }
template<> inline void Texture<double2,double2,3>::unbind() { cudaUnbindTexture(tex_int4_3); }
template<> inline void Texture<double2,double2,4>::unbind() { cudaUnbindTexture(tex_int4_4); }

template<> inline void Texture<float2,double2,0>::unbind() { cudaUnbindTexture(tex_int4_0); }
template<> inline void Texture<float2,double2,1>::unbind() { cudaUnbindTexture(tex_int4_1); }
template<> inline void Texture<float2,double2,2>::unbind() { cudaUnbindTexture(tex_int4_2); }
template<> inline void Texture<float2,double2,3>::unbind() { cudaUnbindTexture(tex_int4_3); }
template<> inline void Texture<float2,double2,4>::unbind() { cudaUnbindTexture(tex_int4_4); }
00203 
// fetch() specializations: read one texel at index idx from the texture
// selected by tex_id.

// short2: the cudaReadModeNormalizedFloat textures return float2 directly
template<> __device__ inline float2 Texture<float2,short2,0>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short2_0,idx); }
template<> __device__ inline float2 Texture<float2,short2,1>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short2_1,idx); }
template<> __device__ inline float2 Texture<float2,short2,2>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short2_2,idx); }
template<> __device__ inline float2 Texture<float2,short2,3>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short2_3,idx); }
template<> __device__ inline float2 Texture<float2,short2,4>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short2_4,idx); }

// short4: likewise returns float4 via normalized-float read mode
template<> __device__ inline float4 Texture<float4,short4,0>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short4_0,idx); }
template<> __device__ inline float4 Texture<float4,short4,1>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short4_1,idx); }
template<> __device__ inline float4 Texture<float4,short4,2>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short4_2,idx); }
template<> __device__ inline float4 Texture<float4,short4,3>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short4_3,idx); }
template<> __device__ inline float4 Texture<float4,short4,4>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_short4_4,idx); }

// float
template<> __device__ inline float Texture<float,float,0>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float_0,idx); }
template<> __device__ inline float Texture<float,float,1>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float_1,idx); }
template<> __device__ inline float Texture<float,float,2>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float_2,idx); }
template<> __device__ inline float Texture<float,float,3>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float_3,idx); }
template<> __device__ inline float Texture<float,float,4>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float_4,idx); }

// float2
template<> __device__ inline float2 Texture<float2,float2,0>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float2_0,idx); }
template<> __device__ inline float2 Texture<float2,float2,1>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float2_1,idx); }
template<> __device__ inline float2 Texture<float2,float2,2>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float2_2,idx); }
template<> __device__ inline float2 Texture<float2,float2,3>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float2_3,idx); }
template<> __device__ inline float2 Texture<float2,float2,4>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float2_4,idx); }

// float4
template<> __device__ inline float4 Texture<float4,float4,0>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float4_0,idx); }
template<> __device__ inline float4 Texture<float4,float4,1>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float4_1,idx); }
template<> __device__ inline float4 Texture<float4,float4,2>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float4_2,idx); }
template<> __device__ inline float4 Texture<float4,float4,3>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float4_3,idx); }
template<> __device__ inline float4 Texture<float4,float4,4>::fetch(unsigned int idx) 
{ return tex1Dfetch(tex_float4_4,idx); }

// double2: read through the int4 textures and reassemble with fetch_double2
#ifndef FERMI_NO_DBLE_TEX
template<> __device__ inline double2 Texture<double2,double2,0>::fetch(unsigned int idx) 
{ return fetch_double2(tex_int4_0,idx); }
template<> __device__ inline double2 Texture<double2,double2,1>::fetch(unsigned int idx) 
{ return fetch_double2(tex_int4_1,idx); }
template<> __device__ inline double2 Texture<double2,double2,2>::fetch(unsigned int idx) 
{ return fetch_double2(tex_int4_2,idx); }
template<> __device__ inline double2 Texture<double2,double2,3>::fetch(unsigned int idx) 
{ return fetch_double2(tex_int4_3,idx); }
template<> __device__ inline double2 Texture<double2,double2,4>::fetch(unsigned int idx) 
{ return fetch_double2(tex_int4_4,idx); }

// mixed precision: double storage truncated to float2 registers
template<> __device__ inline float2 Texture<float2,double2,0>::fetch(unsigned int idx) 
{ double2 x = fetch_double2(tex_int4_0,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,1>::fetch(unsigned int idx) 
{ double2 x = fetch_double2(tex_int4_1,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,2>::fetch(unsigned int idx) 
{ double2 x = fetch_double2(tex_int4_2,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,3>::fetch(unsigned int idx) 
{ double2 x = fetch_double2(tex_int4_3,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,4>::fetch(unsigned int idx) 
{ double2 x = fetch_double2(tex_int4_4,idx); return make_float2(x.x, x.y); }

#else

// FERMI_NO_DBLE_TEX: bypass textures entirely for double-precision data
// and read directly through the stored raw pointer
template<> __device__ inline double2 Texture<double2,double2,0>::fetch(unsigned int idx) 
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,1>::fetch(unsigned int idx) 
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,2>::fetch(unsigned int idx) 
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,3>::fetch(unsigned int idx) 
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,4>::fetch(unsigned int idx) 
{ return spinor[idx]; }

template<> __device__ inline float2 Texture<float2,double2,0>::fetch(unsigned int idx) 
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,1>::fetch(unsigned int idx) 
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,2>::fetch(unsigned int idx) 
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,3>::fetch(unsigned int idx) 
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,4>::fetch(unsigned int idx) 
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }

#endif //  FERMI_NO_DBLE_TEX
00313 
/**
   Sanity-check the (register, intermediate, storage) type triple used by
   Spinor/SpinorTexture.  Aborts via errorQuda on an unsupported
   combination.  Sizes are taken from the .x member, i.e. the scalar
   component width of each vector type.
*/
template <typename RegType, typename InterType, typename StoreType>
void checkTypes() {

  const size_t reg_size = sizeof(((RegType*)0)->x);
  const size_t inter_size = sizeof(((InterType*)0)->x);
  const size_t store_size = sizeof(((StoreType*)0)->x);

  // A register/intermediate precision mismatch is tolerated when the
  // storage is half precision (2-byte) or the intermediate is single
  // precision (4-byte) — those mixed-precision combinations are allowed.
  if (reg_size != inter_size  && store_size != 2 && inter_size != 4)
    errorQuda("Precision of register (%lu) and intermediate (%lu) types must match\n",
              reg_size, inter_size);
  
  // message fixed: this compares the intermediate and STORAGE types
  // (the old text wrongly said "register")
  if (vecLength<InterType>() != vecLength<StoreType>()) {
    errorQuda("Vector lengths of intermediate and storage types must match\n");
  }

  // vecLength returns 0 for any type it does not recognize
  if (vecLength<RegType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<InterType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<StoreType>() == 0) errorQuda("Vector type not supported\n");

}
00345 
// FIXME: Can we merge the Spinor and SpinorTexture objects so that
// reading from texture is simply a constructor option?

// the number of elements per virtual register
#define REG_LENGTH (sizeof(RegType) / sizeof(((RegType*)0)->x))

/**
   Read-only accessor for a cudaColorSpinorField that fetches the field
   data through a texture (tex_id selects the file-scope texture used)
   and converts from storage order to register order on load.  There is
   deliberately no save method: textures are read-only.
*/
template <typename RegType, typename InterType, typename StoreType, int N, int tex_id>
class SpinorTexture {

 private:
  Texture<InterType, StoreType, tex_id> spinor; // texture wrapper over the field data
  /* It's faster to always use direct reads for the norm, but leave
     this option in there for the future.*/
#if (__COMPUTE_CAPABILITY__ >= 000) // always true: the direct-read branch is compiled
  float *norm;
#else
  Texture<float, float, tex_id> norm;
#endif
  int stride; // element stride between successive components of a site

 public:

#if (__COMPUTE_CAPABILITY__ >= 000) // always true (see note on the norm member)
 SpinorTexture() 
   : spinor((StoreType*)0, 0), norm(0), stride(0) {;} // default constructor

 SpinorTexture(const cudaColorSpinorField &x) 
   : spinor((StoreType*)x.V(), x.Bytes()), norm((float*)x.Norm()),
    stride(x.Length()/(N*REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }
#else
 SpinorTexture()
  : spinor((StoreType*)0, 0), norm(0, 0), stride(0) {;} // default constructor

 SpinorTexture(const cudaColorSpinorField &x) 
   : spinor((StoreType*)x.V(), x.Bytes()), norm((float*)x.Norm(), x.NormBytes()),
    stride(x.Length()/(N*REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }
#endif

  ~SpinorTexture() {;}

  SpinorTexture& operator=(const SpinorTexture &src) {
    if (&src != this) {
      spinor = src.spinor;
      norm = src.norm;
      stride = src.stride;
    }
    return *this;
  }


  // Load N register-type values for site i: fetch M intermediate-type
  // values through the texture (rescaling by the per-site norm when the
  // storage is half precision), then reorder into register layout.
  __device__ inline void load(RegType x[], const int i) {
    // load data into registers first using the storage order
    const int M = (N * sizeof(RegType)) / sizeof(InterType);
    InterType y[M];

    // half precision types: InterType is twice the width of StoreType,
    // so the normalized fetch results must be rescaled by the norm
    if (sizeof(InterType) == 2*sizeof(StoreType)) { 
      float xN = norm[i];
#pragma unroll
      for (int j=0; j<M; j++) {
        y[j] = spinor[i + j*stride];
        y[j] *= xN;
      }
    } else { // other types
#pragma unroll 
      for (int j=0; j<M; j++) copyFloatN(y[j], spinor[i + j*stride]);
    }

    // now convert into desired register order
    convert<RegType, InterType>(x, y, N);
  }

  // no save method for Textures

  // Deduce the storage precision from the width of StoreType's .x member.
  QudaPrecision Precision() { 
    QudaPrecision precision = QUDA_INVALID_PRECISION;
    if (sizeof(((StoreType*)0)->x) == sizeof(double)) precision = QUDA_DOUBLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(float)) precision = QUDA_SINGLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(short)) precision = QUDA_HALF_PRECISION;
    else errorQuda("Unknown precision type\n");
    return precision;
  }

  int Stride() { return stride; }
};
00438 
/**
   Direct-access (non-texture) accessor for a cudaColorSpinorField: loads
   and stores go straight through raw device pointers.  Unlike
   SpinorTexture this supports writing, plus host-side backup/restore of
   the field contents.
*/
template <typename RegType, typename InterType, typename StoreType, int N>
class Spinor {

 private:
  StoreType *spinor; // device pointer to the field data
  float *norm;       // device pointer to the half-precision norm array
  const int stride;  // element stride between successive components of a site

 public:
 Spinor(cudaColorSpinorField &x) : 
  spinor((StoreType*)x.V()), norm((float*)x.Norm()),  stride(x.Length()/(N*REG_LENGTH))
    { checkTypes<RegType,InterType,StoreType>(); } 

 Spinor(const cudaColorSpinorField &x) :
  spinor((StoreType*)x.V()), norm((float*)x.Norm()), stride(x.Length()/(N*REG_LENGTH))
    { checkTypes<RegType,InterType,StoreType>(); } 
  ~Spinor() {;}

  // default load used for simple fields: read M intermediate values and
  // reorder into register layout
  __device__ inline void load(RegType x[], const int i) {
    // load data into registers first
    const int M = (N * sizeof(RegType)) / sizeof(InterType);
    InterType y[M];
#pragma unroll
    for (int j=0; j<M; j++) copyFloatN(y[j],spinor[i + j*stride]);

    convert<RegType, InterType>(x, y, N);
  }

  // default store used for simple fields (half-precision storage is
  // handled by the saveHalf specializations below)
  __device__ inline void save(RegType x[], int i) {
    const int M = (N * sizeof(RegType)) / sizeof(InterType);
    InterType y[M];
    convert<InterType, RegType>(y, x, M);
#pragma unroll
    for (int j=0; j<M; j++) copyFloatN(spinor[i+j*stride], y[j]);
  }

  // Backup the field to the host: allocates *spinor_h (and *norm_h when
  // norm_bytes > 0) with new[]; ownership passes to the caller until the
  // matching load() below releases the buffers.
  void save(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
    *spinor_h = new char[bytes];
    cudaMemcpy(*spinor_h, spinor, bytes, cudaMemcpyDeviceToHost);
    if (norm_bytes > 0) {
      *norm_h = new char[norm_bytes];
      cudaMemcpy(*norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
    }
    checkCudaError();
  }

  // Restore the field from the host backup and free the host buffers.
  void load(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
    cudaMemcpy(spinor, *spinor_h, bytes, cudaMemcpyHostToDevice);
    if (norm_bytes > 0) {
      cudaMemcpy(norm, *norm_h, norm_bytes, cudaMemcpyHostToDevice);
      delete []*norm_h; // fixed: buffer came from new[], so delete[] is required
      *norm_h = 0;
    }
    delete []*spinor_h; // fixed: buffer came from new[], so delete[] is required
    *spinor_h = 0;
    checkCudaError();
  }

  void* V() { return (void*)spinor; }
  float* Norm() { return norm; }

  // Deduce the storage precision from the width of StoreType's .x member.
  QudaPrecision Precision() { 
    QudaPrecision precision = QUDA_INVALID_PRECISION;
    if (sizeof(((StoreType*)0)->x) == sizeof(double)) precision = QUDA_DOUBLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(float)) precision = QUDA_SINGLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(short)) precision = QUDA_HALF_PRECISION;
    else errorQuda("Unknown precision type\n");
    return precision;
  }

  int Stride() { return stride; }
};
00518 
// Store M input values at site i in half precision: record the per-site
// maximum magnitude in the norm array, then rescale each value into the
// short range before writing it out.
// NOTE(review): a zero maximum makes __fdividef produce an infinite
// scale — presumably the inputs are never identically zero; confirm.
template <typename OutputType, typename InputType, int M>
  __device__ inline void saveHalf(OutputType *x_o, float *norm, InputType x_i[M], int i, int stride) {
  // per-element absolute maxima, reduced into element 0
  float m[M];
#pragma unroll
  for (int k=0; k<M; k++) m[k] = max_fabs(x_i[k]);
#pragma unroll
  for (int k=1; k<M; k++) m[0] = fmaxf(m[k], m[0]);

  norm[i] = m[0]; // record the normalization factor

  // rescale into the representable short range and store
  const float rescale = __fdividef(MAX_SHORT, m[0]);
#pragma unroll
  for (int k=0; k<M; k++) {
    x_o[i+k*stride] = make_shortN(rescale*x_i[k]);
  }
}
00536 
// Half-precision save() specializations: route stores through saveHalf(),
// which computes the per-site norm and rescales into short storage.

template <>
__device__ inline void Spinor<float2, float2, short2, 3>::save(float2 x[3], int i) {
  saveHalf<short2, float2, 3>(spinor, norm, x, i, stride);
}

template <>
__device__ inline void Spinor<float4, float4, short4, 6>::save(float4 x[6], int i) {
  saveHalf<short4, float4, 6>(spinor, norm, x, i, stride);
}

template <>
__device__ inline void Spinor<double2, double2, short2, 3>::save(double2 x[3], int i) {
  saveHalf<short2, double2, 3>(spinor, norm, x, i, stride);
}

// double2 registers with short4 storage: first repack the 12 double2
// values into 6 double4 values so they match the short4 texel layout
template <>
__device__ inline void Spinor<double2, double4, short4, 12>::save(double2 x[12], int i) {
  double4 y[6];
  convert<double4, double2>(y, x, 6);
  saveHalf<short4, double4, 6>(spinor, norm, y, i, stride);
}
00558 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines