QUDA v0.4.0
A library for QCD on GPUs
// NOTE(review): this header uses the legacy CUDA texture-reference API
// (texture<>, tex1Dfetch, cudaBindTexture), which requires nvcc and was
// removed in CUDA 12; it only compiles against older toolkits.

#include <convert.h>

// NOTE(review): #pragma once appears after the #include above; it still
// guards the header, but conventionally belongs on the first line — confirm
// before moving, in case the include order is load-bearing elsewhere.
#pragma once

#if (__COMPUTE_CAPABILITY__ >= 130)
// Fetch a double2 through an int4 texture: texture hardware cannot return
// doubles directly, so each double is reassembled from two 32-bit words
// with __hiloint2double.  Requires double-precision hardware (sm_13+).
__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t,i);
  return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}
#else
// Stub for pre-sm_13 hardware, which has no double-precision support;
// always returns zero.
__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  // do nothing
  return make_double2(0.0, 0.0);
}
#endif

// In the legacy API texture references must be file-scope globals, so one
// reference is declared per (element type, tex_id) pair; tex_id selects
// which of the five references a Texture<> instantiation binds to.
// The short textures use cudaReadModeNormalizedFloat, so fetches return
// floats normalized to [-1,1].
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_0;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_1;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_2;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_3;
texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_4;

texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_0;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_1;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_2;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_3;
texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_4;

texture<float,1> tex_float_0;
texture<float,1> tex_float_1;
texture<float,1> tex_float_2;
texture<float,1> tex_float_3;
texture<float,1> tex_float_4;

texture<float2,1> tex_float2_0;
texture<float2,1> tex_float2_1;
texture<float2,1> tex_float2_2;
texture<float2,1> tex_float2_3;
texture<float2,1> tex_float2_4;

texture<float4,1> tex_float4_0;
texture<float4,1> tex_float4_1;
texture<float4,1> tex_float4_2;
texture<float4,1> tex_float4_3;
texture<float4,1> tex_float4_4;

// int4 textures back the double-precision fetches (see fetch_double2 above).
texture<int4,1> tex_int4_0;
texture<int4,1> tex_int4_1;
texture<int4,1> tex_int4_2;
texture<int4,1> tex_int4_3;
texture<int4,1> tex_int4_4;

// Maximum number of texels bound per texture (2^27).
#define MAX_TEXELS (1<<27)

/**
   Thin wrapper around one of the global texture references above.
   OutputType is the type returned by fetch(), InputType is the stored
   element type, and tex_id (0..4) selects which global texture reference
   is used.  The generic (non-specialized) bind/unbind/fetch members are
   error fallbacks; the real implementations are the explicit
   specializations that follow.
 */
template<typename OutputType, typename InputType, int tex_id=0>
class Texture {

  private:
  const InputType *spinor; // used when textures are disabled
  //size_t bytes;

  public:
  Texture() { ; }
  Texture(const InputType *x, size_t bytes) : spinor(x)/*, bytes(bytes)*/ {
    // NOTE(review): binds the maximum extent (MAX_TEXELS elements) rather
    // than the actual buffer size (the exact-size bind is commented out
    // below) — presumably to avoid rebinding when buffer sizes change, but
    // it advertises a range beyond the allocation; confirm this is safe
    // for the driver in use.
    if (bytes) bind(x, MAX_TEXELS*sizeof(InputType)); // only bind if bytes > 0
    //if (bytes) bind(x, bytes); // only bind if bytes > 0
  }
  ~Texture() { /*if (bytes) */ /*unbind()*/; } // unbinding is unnecessary and costly

  // Shallow copy: only the fallback pointer is copied; the texture binding
  // is global state keyed by tex_id and is not touched here.
  Texture& operator=(const Texture &tex) {
    spinor = tex.spinor;
    return *this;
  }

  // Fallbacks: reached only when no specialization exists for this
  // (OutputType, InputType, tex_id) combination.
  inline void bind(const InputType*, size_t bytes){ errorQuda("Texture id is out of range"); }
  inline void unbind() { errorQuda("Texture id is out of range"); }

  //default should only be called if a tex_id is out of range
  __device__ inline OutputType fetch(unsigned int idx) { return 0; };
  __device__ inline OutputType operator[](unsigned int idx) { return fetch(idx); }
};

// bind() specializations: each (OutputType, InputType, tex_id) combination
// binds its pointer to the matching global texture reference.

// half precision (short2/short4), fetched as normalized floats
template<> inline void Texture<float2,short2,0>::bind(const short2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short2_0, ptr, bytes); }
template<> inline void Texture<float2,short2,1>::bind(const short2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short2_1, ptr, bytes); }
template<> inline void Texture<float2,short2,2>::bind(const short2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short2_2, ptr, bytes); }
template<> inline void Texture<float2,short2,3>::bind(const short2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short2_3, ptr, bytes); }
template<> inline void Texture<float2,short2,4>::bind(const short2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short2_4, ptr, bytes); }

template<> inline void Texture<float4,short4,0>::bind(const short4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short4_0, ptr, bytes); }
template<> inline void Texture<float4,short4,1>::bind(const short4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short4_1, ptr, bytes); }
template<> inline void Texture<float4,short4,2>::bind(const short4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short4_2, ptr, bytes); }
template<> inline void Texture<float4,short4,3>::bind(const short4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short4_3, ptr, bytes); }
template<> inline void Texture<float4,short4,4>::bind(const short4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_short4_4, ptr, bytes); }

// single precision
template<> inline void Texture<float,float,0>::bind(const float *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float_0, ptr, bytes); }
template<> inline void Texture<float,float,1>::bind(const float *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float_1, ptr, bytes); }
template<> inline void Texture<float,float,2>::bind(const float *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float_2, ptr, bytes); }
template<> inline void Texture<float,float,3>::bind(const float *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float_3, ptr, bytes); }
template<> inline void Texture<float,float,4>::bind(const float *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float_4, ptr, bytes); }

template<> inline void Texture<float2,float2,0>::bind(const float2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float2_0, ptr, bytes); }
template<> inline void Texture<float2,float2,1>::bind(const float2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float2_1, ptr, bytes); }
template<> inline void Texture<float2,float2,2>::bind(const float2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float2_2, ptr, bytes); }
template<> inline void Texture<float2,float2,3>::bind(const float2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float2_3, ptr, bytes); }
template<> inline void Texture<float2,float2,4>::bind(const float2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float2_4, ptr, bytes); }

template<> inline void Texture<float4,float4,0>::bind(const float4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float4_0, ptr, bytes); }
template<> inline void Texture<float4,float4,1>::bind(const float4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float4_1, ptr, bytes); }
template<> inline void Texture<float4,float4,2>::bind(const float4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float4_2, ptr, bytes); }
template<> inline void Texture<float4,float4,3>::bind(const float4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float4_3, ptr, bytes); }
template<> inline void Texture<float4,float4,4>::bind(const float4 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_float4_4, ptr, bytes); }

// double precision: bound through the int4 textures, since textures cannot
// return doubles directly (see fetch_double2 above)
template<> inline void Texture<double2,double2,0>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_0, ptr, bytes); }
template<> inline void Texture<double2,double2,1>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_1, ptr, bytes); }
template<> inline void Texture<double2,double2,2>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_2, ptr, bytes); }
template<> inline void Texture<double2,double2,3>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_3, ptr, bytes); }
template<> inline void Texture<double2,double2,4>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_4, ptr, bytes); }

// double storage read back as single precision (mixed-precision fetches)
template<> inline void Texture<float2,double2,0>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_0, ptr, bytes); }
template<> inline void Texture<float2,double2,1>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_1, ptr, bytes); }
template<> inline void Texture<float2,double2,2>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_2, ptr, bytes); }
// Remaining mixed-precision bind() specializations (double storage read as
// float through the int4 textures).
template<> inline void Texture<float2,double2,3>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_3, ptr, bytes); }
template<> inline void Texture<float2,double2,4>::bind(const double2 *ptr, size_t bytes)
{ cudaBindTexture(0,tex_int4_4, ptr, bytes); }

// unbind() specializations: release the global texture reference selected
// by (OutputType, InputType, tex_id).  Note the Texture destructor does not
// call these (unbinding is deemed unnecessary and costly).
template<> inline void Texture<float2,short2,0>::unbind() { cudaUnbindTexture(tex_short2_0); }
template<> inline void Texture<float2,short2,1>::unbind() { cudaUnbindTexture(tex_short2_1); }
template<> inline void Texture<float2,short2,2>::unbind() { cudaUnbindTexture(tex_short2_2); }
template<> inline void Texture<float2,short2,3>::unbind() { cudaUnbindTexture(tex_short2_3); }
template<> inline void Texture<float2,short2,4>::unbind() { cudaUnbindTexture(tex_short2_4); }

template<> inline void Texture<float4,short4,0>::unbind() { cudaUnbindTexture(tex_short4_0); }
template<> inline void Texture<float4,short4,1>::unbind() { cudaUnbindTexture(tex_short4_1); }
template<> inline void Texture<float4,short4,2>::unbind() { cudaUnbindTexture(tex_short4_2); }
template<> inline void Texture<float4,short4,3>::unbind() { cudaUnbindTexture(tex_short4_3); }
template<> inline void Texture<float4,short4,4>::unbind() { cudaUnbindTexture(tex_short4_4); }

template<> inline void Texture<float,float,0>::unbind() { cudaUnbindTexture(tex_float_0); }
template<> inline void Texture<float,float,1>::unbind() { cudaUnbindTexture(tex_float_1); }
template<> inline void Texture<float,float,2>::unbind() { cudaUnbindTexture(tex_float_2); }
template<> inline void Texture<float,float,3>::unbind() { cudaUnbindTexture(tex_float_3); }
template<> inline void Texture<float,float,4>::unbind() { cudaUnbindTexture(tex_float_4); }

template<> inline void Texture<float2,float2,0>::unbind() { cudaUnbindTexture(tex_float2_0); }
template<> inline void Texture<float2,float2,1>::unbind() { cudaUnbindTexture(tex_float2_1); }
template<> inline void Texture<float2,float2,2>::unbind() { cudaUnbindTexture(tex_float2_2); }
template<> inline void Texture<float2,float2,3>::unbind() { cudaUnbindTexture(tex_float2_3); }
template<> inline void Texture<float2,float2,4>::unbind() { cudaUnbindTexture(tex_float2_4); }

template<> inline void Texture<float4,float4,0>::unbind() { cudaUnbindTexture(tex_float4_0); }
template<> inline void Texture<float4,float4,1>::unbind() { cudaUnbindTexture(tex_float4_1); }
template<> inline void Texture<float4,float4,2>::unbind() { cudaUnbindTexture(tex_float4_2); }
template<> inline void Texture<float4,float4,3>::unbind() { cudaUnbindTexture(tex_float4_3); }
template<> inline void Texture<float4,float4,4>::unbind() { cudaUnbindTexture(tex_float4_4); }

template<> inline void Texture<double2,double2,0>::unbind() { cudaUnbindTexture(tex_int4_0); }
template<> inline void Texture<double2,double2,1>::unbind() { cudaUnbindTexture(tex_int4_1); }
template<> inline void Texture<double2,double2,2>::unbind() { cudaUnbindTexture(tex_int4_2); }
template<> inline void Texture<double2,double2,3>::unbind() { cudaUnbindTexture(tex_int4_3); }
template<> inline void Texture<double2,double2,4>::unbind() { cudaUnbindTexture(tex_int4_4); }

template<> inline void Texture<float2,double2,0>::unbind() { cudaUnbindTexture(tex_int4_0); }
template<> inline void Texture<float2,double2,1>::unbind() { cudaUnbindTexture(tex_int4_1); }
template<> inline void Texture<float2,double2,2>::unbind() { cudaUnbindTexture(tex_int4_2); }
template<> inline void Texture<float2,double2,3>::unbind() { cudaUnbindTexture(tex_int4_3); }
template<> inline void Texture<float2,double2,4>::unbind() { cudaUnbindTexture(tex_int4_4); }

// fetch() specializations: read element idx from the texture reference
// bound to this (OutputType, InputType, tex_id) combination.

// short2 (hardware-normalized to float2)
template<> __device__ inline float2 Texture<float2,short2,0>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short2_0,idx); }
template<> __device__ inline float2 Texture<float2,short2,1>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short2_1,idx); }
template<> __device__ inline float2 Texture<float2,short2,2>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short2_2,idx); }
template<> __device__ inline float2 Texture<float2,short2,3>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short2_3,idx); }
template<> __device__ inline float2 Texture<float2,short2,4>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short2_4,idx); }

// short4 (hardware-normalized to float4)
template<> __device__ inline float4 Texture<float4,short4,0>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short4_0,idx); }
template<> __device__ inline float4 Texture<float4,short4,1>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short4_1,idx); }
template<> __device__ inline float4 Texture<float4,short4,2>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short4_2,idx); }
template<> __device__ inline float4 Texture<float4,short4,3>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short4_3,idx); }
template<> __device__ inline float4 Texture<float4,short4,4>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_short4_4,idx); }

// float
template<> __device__ inline float Texture<float,float,0>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float_0,idx); }
template<> __device__ inline float Texture<float,float,1>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float_1,idx); }
template<> __device__ inline float Texture<float,float,2>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float_2,idx); }
template<> __device__ inline float Texture<float,float,3>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float_3,idx); }
template<> __device__ inline float Texture<float,float,4>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float_4,idx); }

// float2
template<> __device__ inline float2 Texture<float2,float2,0>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float2_0,idx); }
template<> __device__ inline float2 Texture<float2,float2,1>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float2_1,idx); }
template<> __device__ inline float2 Texture<float2,float2,2>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float2_2,idx); }
template<> __device__ inline float2 Texture<float2,float2,3>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float2_3,idx); }
template<> __device__ inline float2 Texture<float2,float2,4>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float2_4,idx); }

// float4
template<> __device__ inline float4 Texture<float4,float4,0>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float4_0,idx); }
template<> __device__ inline float4 Texture<float4,float4,1>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float4_1,idx); }
template<> __device__ inline float4 Texture<float4,float4,2>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float4_2,idx); }
template<> __device__ inline float4 Texture<float4,float4,3>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float4_3,idx); }
template<> __device__ inline float4 Texture<float4,float4,4>::fetch(unsigned int idx)
{ return tex1Dfetch(tex_float4_4,idx); }

// double2: fetched through the int4 textures and reassembled, unless
// FERMI_NO_DBLE_TEX is defined, in which case direct global-memory reads
// through the fallback pointer are used instead.
#ifndef FERMI_NO_DBLE_TEX
template<> __device__ inline double2 Texture<double2,double2,0>::fetch(unsigned int idx)
{ return fetch_double2(tex_int4_0,idx); }
template<> __device__ inline double2 Texture<double2,double2,1>::fetch(unsigned int idx)
{ return fetch_double2(tex_int4_1,idx); }
template<> __device__ inline double2 Texture<double2,double2,2>::fetch(unsigned int idx)
{ return fetch_double2(tex_int4_2,idx); }
template<> __device__ inline double2 Texture<double2,double2,3>::fetch(unsigned int idx)
{ return fetch_double2(tex_int4_3,idx); }
template<> __device__ inline double2 Texture<double2,double2,4>::fetch(unsigned int idx)
{ return fetch_double2(tex_int4_4,idx); }

// double storage truncated to single precision on the fly
template<> __device__ inline float2 Texture<float2,double2,0>::fetch(unsigned int idx)
{ double2 x = fetch_double2(tex_int4_0,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,1>::fetch(unsigned int idx)
{ double2 x = fetch_double2(tex_int4_1,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,2>::fetch(unsigned int idx)
{ double2 x = fetch_double2(tex_int4_2,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,3>::fetch(unsigned int idx)
{ double2 x = fetch_double2(tex_int4_3,idx); return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,4>::fetch(unsigned int idx)
{ double2 x = fetch_double2(tex_int4_4,idx); return make_float2(x.x, x.y); }

#else

template<> __device__ inline double2 Texture<double2,double2,0>::fetch(unsigned int idx)
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,1>::fetch(unsigned int idx)
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,2>::fetch(unsigned int idx)
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,3>::fetch(unsigned int idx)
{ return spinor[idx]; }
template<> __device__ inline double2 Texture<double2,double2,4>::fetch(unsigned int idx)
{ return spinor[idx]; }

template<> __device__ inline float2 Texture<float2,double2,0>::fetch(unsigned int idx)
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,1>::fetch(unsigned int idx)
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,2>::fetch(unsigned int idx)
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,3>::fetch(unsigned int idx)
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }
template<> __device__ inline float2 Texture<float2,double2,4>::fetch(unsigned int idx)
{ double2 x = spinor[idx]; return make_float2(x.x, x.y); }

#endif // FERMI_NO_DBLE_TEX

/**
   Sanity-check the (RegType, InterType, StoreType) triple used by a
   Spinor/SpinorTexture instantiation: component precisions and vector
   lengths must be mutually consistent.  Fails via errorQuda() on mismatch.
 */
template <typename RegType, typename InterType, typename StoreType>
void checkTypes() {

  // component (not struct) sizes, e.g. sizeof(double) for double2
  const size_t reg_size = sizeof(((RegType*)0)->x);
  const size_t inter_size = sizeof(((InterType*)0)->x);
  const size_t store_size = sizeof(((StoreType*)0)->x);

  // NOTE(review): the intent appears to be "register and intermediate
  // precisions must match, except for half precision where short storage
  // promotes to a float intermediate"; that reading would need
  // !(store_size == 2 && inter_size == 4), i.e. (store_size != 2 ||
  // inter_size != 4), rather than the && chain below — confirm before
  // changing, since the current form only relaxes the check further.
  if (reg_size != inter_size && store_size != 2 && inter_size != 4)
    errorQuda("Precision of register (%lu) and intermediate (%lu) types must match\n",
	      reg_size, inter_size);

  // the intermediate and storage types must carry the same number of
  // components per element (fixed: message previously said "register"
  // although the comparison is against the store type)
  if (vecLength<InterType>() != vecLength<StoreType>()) {
    errorQuda("Vector lengths of intermediate and store types must match\n");
  }

  // vecLength() returns 0 for unsupported vector types
  if (vecLength<RegType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<InterType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<StoreType>() == 0) errorQuda("Vector type not supported\n");

}

// FIXME: Can we merge the Spinor and SpinorTexture objects so that
// reading from texture is simply a constructor option?
// the number of elements per virtual register
#define REG_LENGTH (sizeof(RegType) / sizeof(((RegType*)0)->x))

/**
   Read-only accessor for a cudaColorSpinorField that loads through the
   texture cache.  RegType is the in-kernel register type, InterType the
   intermediate load type, StoreType the in-memory storage type; N is the
   number of registers per site and tex_id selects the global texture
   reference.  There is no save() — textures are read-only.
 */
template <typename RegType, typename InterType, typename StoreType, int N, int tex_id>
class SpinorTexture {

private:
  Texture<InterType, StoreType, tex_id> spinor;
  /* It's faster to always use direct reads for the norm, but leave
     this option in there for the future.*/
#if (__COMPUTE_CAPABILITY__ >= 000)  // always true: direct norm reads selected
  float *norm;
#else
  Texture<float, float, tex_id> norm;
#endif
  int stride;  // element stride between successive components of a site

public:

#if (__COMPUTE_CAPABILITY__ >= 000)
  SpinorTexture()
    : spinor((StoreType*)0, 0), norm(0), stride(0) {;} // default constructor

  SpinorTexture(const cudaColorSpinorField &x)
    : spinor((StoreType*)x.V(), x.Bytes()), norm((float*)x.Norm()),
      stride(x.Length()/(N*REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }
#else
  SpinorTexture()
    : spinor((StoreType*)0, 0), norm(0, 0), stride(0) {;} // default constructor

  SpinorTexture(const cudaColorSpinorField &x)
    : spinor((StoreType*)x.V(), x.Bytes()), norm((float*)x.Norm(), x.NormBytes()),
      stride(x.Length()/(N*REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }
#endif

  ~SpinorTexture() {;}

  SpinorTexture& operator=(const SpinorTexture &src) {
    if (&src != this) {
      spinor = src.spinor;
      norm = src.norm;
      stride = src.stride;
    }
    return *this;
  }

  /** Load site i into registers x[], converting from storage order. */
  __device__ inline void load(RegType x[], const int i) {
    // load data into registers first using the storage order
    const int M = (N * sizeof(RegType)) / sizeof(InterType);
    InterType y[M];

    // half precision types: rescale each component by the per-site norm
    if (sizeof(InterType) == 2*sizeof(StoreType)) {
      float xN = norm[i];
#pragma unroll
      for (int j=0; j<M; j++) {
	y[j] = spinor[i + j*stride];
	y[j] *= xN;
      }
    } else { // other types
#pragma unroll
      for (int j=0; j<M; j++) copyFloatN(y[j], spinor[i + j*stride]);
    }

    // now convert into desired register order
    convert<RegType, InterType>(x, y, N);
  }

  // no save method for Textures

  /** Storage precision, deduced from the StoreType component size. */
  QudaPrecision Precision() {
    QudaPrecision precision = QUDA_INVALID_PRECISION;
    if (sizeof(((StoreType*)0)->x) == sizeof(double)) precision = QUDA_DOUBLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(float)) precision = QUDA_SINGLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(short)) precision = QUDA_HALF_PRECISION;
    else errorQuda("Unknown precision type\n");
    return precision;
  }

  int Stride() { return stride; }
};

/**
   Read/write accessor for a cudaColorSpinorField using direct global-memory
   loads and stores (no textures).  Type parameters as for SpinorTexture.
 */
template <typename RegType, typename InterType, typename StoreType, int N>
class Spinor {

private:
  StoreType *spinor;  // device pointer to the field data
  float *norm;        // device pointer to the half-precision norm array
  const int stride;   // element stride between successive components of a site

public:
  Spinor(cudaColorSpinorField &x) :
    spinor((StoreType*)x.V()), norm((float*)x.Norm()), stride(x.Length()/(N*REG_LENGTH))
    { checkTypes<RegType,InterType,StoreType>(); }

  Spinor(const cudaColorSpinorField &x) :
    spinor((StoreType*)x.V()), norm((float*)x.Norm()), stride(x.Length()/(N*REG_LENGTH))
    { checkTypes<RegType,InterType,StoreType>(); }
  ~Spinor() {;}

  // default load used for simple fields
  __device__ inline void load(RegType x[], const int i) {
    // load data into registers first
    const int M = (N * sizeof(RegType)) / sizeof(InterType);
    InterType y[M];
#pragma unroll
    for (int j=0; j<M; j++) copyFloatN(y[j],spinor[i + j*stride]);

    convert<RegType, InterType>(x, y, N);
  }

  // default store used for simple fields (specialized below for half precision)
  __device__ inline void save(RegType x[], int i) {
    const int M = (N * sizeof(RegType)) / sizeof(InterType);
    InterType y[M];
    convert<InterType, RegType>(y, x, M);
#pragma unroll
    for (int j=0; j<M; j++) copyFloatN(spinor[i+j*stride], y[j]);
  }

  /**
     Back up the field to newly allocated host buffers.  The buffers are
     allocated here with new[] and ownership passes to the caller; the
     matching load() below releases them.
   */
  void save(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
    *spinor_h = new char[bytes];
    cudaMemcpy(*spinor_h, spinor, bytes, cudaMemcpyDeviceToHost);
    if (norm_bytes > 0) {
      *norm_h = new char[norm_bytes];
      cudaMemcpy(*norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
    }
    checkCudaError();
  }

  /**
     Restore the field from host buffers previously created by save() and
     free them.  Fixed: the buffers are allocated with new[], so they must
     be released with delete[] (scalar delete was undefined behavior).
   */
  void load(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes) {
    cudaMemcpy(spinor, *spinor_h, bytes, cudaMemcpyHostToDevice);
    if (norm_bytes > 0) {
      cudaMemcpy(norm, *norm_h, norm_bytes, cudaMemcpyHostToDevice);
      delete[] *norm_h;
    }
    delete[] *spinor_h;
    checkCudaError();
  }

  void* V() { return (void*)spinor; }
  float* Norm() { return norm; }

  /** Storage precision, deduced from the StoreType component size. */
  QudaPrecision Precision() {
    QudaPrecision precision = QUDA_INVALID_PRECISION;
    if (sizeof(((StoreType*)0)->x) == sizeof(double)) precision = QUDA_DOUBLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(float)) precision = QUDA_SINGLE_PRECISION;
    else if (sizeof(((StoreType*)0)->x) == sizeof(short)) precision = QUDA_HALF_PRECISION;
    else errorQuda("Unknown precision type\n");
    return precision;
  }

  int Stride() { return stride; }
};

/**
   Store M half-precision elements for site i: compute the max absolute
   value across the M inputs, record it in norm[i], then store each value
   rescaled to the short range.  Uses the fast (approximate) __fdividef.
 */
template <typename OutputType, typename InputType, int M>
__device__ inline void saveHalf(OutputType *x_o, float *norm, InputType x_i[M], int i, int stride) {
  float c[M];
#pragma unroll
  for (int j=0; j<M; j++) c[j] = max_fabs(x_i[j]);
#pragma unroll
  for (int j=1; j<M; j++) c[0] = fmaxf(c[j],c[0]);

  norm[i] = c[0]; // store norm value

  // store spinor values rescaled so the max maps to MAX_SHORT
  float C = __fdividef(MAX_SHORT, c[0]);
#pragma unroll
  for (int j=0; j<M; j++) {
    x_o[i+j*stride] = make_shortN(C*x_i[j]);
  }
}

// Half-precision save() specializations: route through saveHalf so the
// per-site norm is written alongside the rescaled short data.
template <>
__device__ inline void Spinor<float2, float2, short2, 3>::save(float2 x[3], int i) {
  saveHalf<short2, float2, 3>(spinor, norm, x, i, stride);
}

template <>
__device__ inline void Spinor<float4, float4, short4, 6>::save(float4 x[6], int i) {
  saveHalf<short4, float4, 6>(spinor, norm, x, i, stride);
}

template <>
__device__ inline void Spinor<double2, double2, short2, 3>::save(double2 x[3], int i) {
  saveHalf<short2, double2, 3>(spinor, norm, x, i, stride);
}

template <>
__device__ inline void Spinor<double2, double4, short4, 12>::save(double2 x[12], int i) {
  // repack double2[12] as double4[6] before rescaling/storing
  double4 y[6];
  convert<double4, double2>(y, x, 6);
  saveHalf<short4, double4, 6>(spinor, norm, y, i, stride);
}