// ---------------------------------------------------------------------------
// NOTE(review): this chunk is a garbled extraction -- the leading integers on
// many lines are original source line numbers fused into the text, and a
// number of lines (the class header, #else branches, #endif lines, braces)
// were dropped.  Comments below annotate the surviving fragments of the
// texture-object flavored Texture<OutputType, InputType> accessor.
// ---------------------------------------------------------------------------
12 #ifdef USE_TEXTURE_OBJECTS
// Accessor parameterized on the register-level output type and the in-memory
// input type.
14 template<
typename OutputType,
typename InputType>
// Data member: a cudaTextureObject_t when texture fetches are enabled;
// presumably a raw InputType* in the DIRECT_ACCESS_BLAS branch (that
// alternative declaration was dropped by the extraction) -- TODO confirm.
20 #ifndef DIRECT_ACCESS_BLAS
21 cudaTextureObject_t
spinor;
28 #ifndef DIRECT_ACCESS_BLAS
// Constructor: captures the field's data pointer, cast to InputType*.
31 Texture(
const cudaColorSpinorField *
x) :
spinor((InputType*)(x->
V())) { }
// Fragment of the copy-assignment operator: self-assignment guard, then
// member-wise copy of the spinor handle/pointer.
37 if (
this != &tex)
spinor = tex.spinor;
41 #ifndef DIRECT_ACCESS_BLAS
// fetch(): two variants (texture path vs direct path); both bodies were
// dropped by the extraction -- only the signatures survive.
42 __device__
inline OutputType
fetch(
unsigned int idx)
49 __device__
inline OutputType
fetch(
unsigned int idx)
// operator[] forwards to fetch().
53 __device__
inline OutputType
operator[](
unsigned int idx) {
return fetch(idx); }
56 #ifndef DIRECT_ACCESS_BLAS
// Fragments reassembling a double / double2 from int2 / int4 texture words
// via __hiloint2double (the enclosing function signatures were dropped).
58 {
return __hiloint2double(v.y, v.x); }
61 {
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); }
// Double-precision texture fetches.  Texture hardware cannot return doubles
// directly, so the 64-bit words are fetched as int2/int4 and reassembled with
// __hiloint2double.  Requires compute capability >= 1.3 (first with double
// support); older devices get stubs that return zero.
// NOTE(review): the extraction had dropped the closing braces, the #else, the
// #endif, and the stub fetch_double2 signature; they are restored here from
// the surviving fragments and the signature cross-reference listing.
#if (__COMPUTE_CAPABILITY__ >= 130)

// Fetch one double from a 1-D int2 texture at index i.
__inline__ __device__ double fetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

// Fetch one double2 from a 1-D int4 texture at index i.
__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, i);
  return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
}

#else // no double support: keep the code compiling, return zeros

__inline__ __device__ double fetch_double(texture<int2, 1> t, int i){ return 0.0; }

__inline__ __device__ double2 fetch_double2(texture<int4, 1> t, int i){ return make_double2(0.0, 0.0); }

#endif
// Maximum number of texels addressable through a 1-D linear texture (2^27).
97 #define MAX_TEXELS (1<<27)
// Fragments of the texture-reference flavored Texture class (used when
// texture objects are unavailable), parameterized additionally on a static
// tex_id that indexes a global tex_id_table[] of bound references.
// NOTE(review): the class header, member declarations, and several
// #else/#endif lines were dropped by the extraction.
104 template<
typename OutputType,
typename InputType,
int tex_
id>
108 #ifdef DIRECT_ACCESS_BLAS
117 #ifdef DIRECT_ACCESS_BLAS
123 #ifdef DIRECT_ACCESS_BLAS
// Constructor fragment: capture the field pointer and byte count ...
124 :
spinor((
const InputType*)x->V()), bytes(x->Bytes())
// ... then guard against double-binding the same texture reference,
// mark the id as in use, and bind the data to it.
129 if (tex_id > 0 && tex_id <=
MAX_TEX_ID && tex_id_table[tex_id]) {
130 errorQuda(
"Already bound to this texture reference");
132 tex_id_table[tex_id] =
true;
134 bind((
const InputType*)x->V(), x->Bytes()); bound =
true;
140 #ifdef DIRECT_ACCESS_BLAS
// Copy-constructor fragment: shares pointer and size, does not rebind.
141 :
spinor(tex.spinor), bytes(tex.bytes)
// Destructor fragment: unbind the texture and release the id slot.
146 unbind(); bound =
false; tex_id_table[tex_id]=
false;
150 #ifdef DIRECT_ACCESS_BLAS
// Direct-access stubs: bind is a no-op ...
157 inline void bind(
const InputType*,
size_t bytes){ }
// ... and fetch returns an OutputType with only its .x component set to 0
// (specializations below replace this with real direct loads).
161 __device__
inline OutputType
fetch(
unsigned int idx) { OutputType
x; x.x =0;
return x; };
// Dangling template headers -- presumably for the bind/unbind and fetch
// member declarations that the specialization macros below define; the
// declarations themselves were dropped by the extraction.
165 template<
typename OutputType,
typename InputType,
int tex_
id>
168 template<
typename OutputType,
typename InputType,
int tex_
id>
// Declares one set of texture references for spinor access, tagged by `id`:
// short2/short4 are normalized to float on fetch; double2 data is declared
// as an int4 texture and reassembled with fetch_double2().
171 #define DECL_TEX(id) \
172 texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_##id; \
173 texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_##id; \
174 texture<float,1> tex_float_##id; \
175 texture<float2,1> tex_float2_##id; \
176 texture<float4,1> tex_float4_##id; \
177 texture<int4,1> tex_double2_##id;
// Specializes Texture<outtype,intype,id>::bind/unbind to (un)bind the
// matching tex_<intype>_<id> texture reference.
180 #define DEF_BIND_UNBIND(outtype, intype, id) \
181 template<> inline void Texture<outtype,intype,id>::bind(const intype *ptr, size_t bytes) \
182 { cudaBindTexture(0,tex_##intype##_##id, ptr, bytes); } \
183 template<> inline void Texture<outtype,intype,id>::unbind() { cudaUnbindTexture(tex_##intype##_##id); }
// fetch() specialization that reads through the bound texture reference.
186 #define DEF_FETCH_TEX(outtype, intype, id) \
187 template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
188 { return tex1Dfetch(tex_##intype##_##id,idx); }
// fetch() specialization that loads straight from global memory (spinor[]),
// converting between vector types via copyFloatN.
191 #define DEF_FETCH_DIRECT(outtype, intype, id) \
192 template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
193 { outtype out; copyFloatN(out, spinor[idx]); return out; }
// Select the fetch flavor: direct global-memory loads when DIRECT_ACCESS_BLAS
// is defined, texture fetches otherwise.
// NOTE(review): the #else/#endif were dropped by the extraction (two
// unconditional same-name #defines make no sense); restored here.
#if defined(DIRECT_ACCESS_BLAS)
#define DEF_FETCH DEF_FETCH_DIRECT
#else
#define DEF_FETCH DEF_FETCH_TEX
#endif
// Double-precision fetch: direct loads when textures cannot be used for
// doubles (DIRECT_ACCESS_BLAS, or the Fermi double-texture workaround);
// otherwise fetch through the int4 texture and reassemble via fetch_double2.
// Note the specialization is always on double2 storage regardless of intype.
// NOTE(review): the #else/#endif were dropped by the extraction; restored.
#if defined(DIRECT_ACCESS_BLAS) || defined(FERMI_NO_DBLE_TEX)
#define DEF_FETCH_DBLE DEF_FETCH_DIRECT
#else
#define DEF_FETCH_DBLE(outtype, intype, id)                                                \
  template<> __device__ inline outtype Texture<outtype,double2,id>::fetch(unsigned int idx) \
  { outtype out; copyFloatN(out, fetch_double2(tex_double2_##id,idx)); return out; }
#endif
// Mixed-precision double fetch: direct loads when textures are unusable for
// doubles; otherwise fetch the lower-precision texture and widen into the
// double-based outtype via copyFloatN.
// NOTE(review): the #else/#endif were dropped by the extraction; restored.
#if defined(DIRECT_ACCESS_BLAS) || defined(FERMI_NO_DBLE_TEX)
#define DEF_FETCH_DBLE_MIXED DEF_FETCH_DIRECT
#else
#define DEF_FETCH_DBLE_MIXED(outtype, intype, id)                                          \
  template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
  { outtype out; copyFloatN(out, tex1Dfetch(tex_##intype##_##id,idx)); return out; }
#endif
// Convenience: define bind/unbind plus the matching fetch in one macro.
220 #define DEF_BIND_UNBIND_FETCH(outtype, intype, id) \
221 DEF_BIND_UNBIND(outtype, intype, id) \
222 DEF_FETCH(outtype, intype, id)
// Instantiates every supported (register type, storage type) combination for
// one texture id: same-precision paths get bind/unbind + fetch together;
// double storage uses the DBLE fetch variant, and double registers over
// float/short storage use the DBLE_MIXED variant.
225 #define DEF_ALL(id) \
227 DEF_BIND_UNBIND_FETCH(float2, short2, id) \
228 DEF_BIND_UNBIND_FETCH(float4, short4, id) \
229 DEF_BIND_UNBIND_FETCH(float, float, id) \
230 DEF_BIND_UNBIND_FETCH(float2, float2, id) \
231 DEF_BIND_UNBIND_FETCH(float4, float4, id) \
232 DEF_BIND_UNBIND(double2, double2, id) \
233 DEF_BIND_UNBIND(float2, double2, id) \
234 DEF_FETCH_DBLE(double2, double2, id) \
235 DEF_FETCH_DBLE(float2, double2, id) \
236 DEF_BIND_UNBIND(double2, float2, id) \
237 DEF_BIND_UNBIND(double4, float4, id) \
238 DEF_BIND_UNBIND(double2, short2, id) \
239 DEF_BIND_UNBIND(double4, short4, id) \
240 DEF_FETCH_DBLE_MIXED(double2, float2, id) \
241 DEF_FETCH_DBLE_MIXED(double4, float4, id) \
242 DEF_FETCH_DBLE_MIXED(double2, short2, id) \
243 DEF_FETCH_DBLE_MIXED(double4, short4, id)
// Tear down the helper macros after the DEF_ALL instantiations.
// NOTE(review): the extraction appears to have dropped some #undef lines
// (original lines 255-256 and 259) from this run -- confirm against upstream.
253 #undef DEF_BIND_UNBIND
254 #undef DEF_FETCH_DIRECT
257 #undef DEF_FETCH_DBLE
258 #undef DEF_BIND_UNBIND_FETCH
261 #endif // USE_TEXTURE_OBJECTS
// Sanity checks on the (register, intermediate, storage) type triple used by
// Spinor: aborts via errorQuda on an unsupported combination.  Component
// sizes are probed with the sizeof(((T*)0)->x) idiom, so each type must be a
// CUDA vector type with an .x member.
// NOTE(review): the function signature line and closing brace were dropped by
// the extraction; the name is grounded by the
// checkTypes<RegType,InterType,StoreType>() call sites visible elsewhere in
// this chunk.
template <typename RegType, typename InterType, typename StoreType>
void checkTypes() {

  const size_t reg_size = sizeof(((RegType*)0)->x);
  const size_t inter_size = sizeof(((InterType*)0)->x);
  const size_t store_size = sizeof(((StoreType*)0)->x);

  // NOTE(review): the && chain means a register/intermediate precision
  // mismatch is tolerated when the storage component is 2 bytes (short) or
  // the intermediate component is 4 bytes (float) -- confirm this is the
  // intended condition rather than ||.
  if (reg_size != inter_size && store_size != 2 && inter_size != 4)
    errorQuda("Precision of register (%lu) and intermediate (%lu) types must match\n",
              (unsigned long)reg_size, (unsigned long)inter_size);

  // Intermediate and storage types must have the same vector length.
  if (vecLength<InterType>() != vecLength<StoreType>()) {
    errorQuda("Vector lengths intermediate and register types must match\n");
  }

  // All three types must be recognized vector types (vecLength() > 0).
  if (vecLength<RegType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<InterType>() == 0) errorQuda("Vector type not supported\n");
  if (vecLength<StoreType>() == 0) errorQuda("Vector type not supported\n");
}
// Fragment of store_norm(float *norm, FloatN x[M], int i) (signature per the
// cross-reference listing at the end of this chunk): computes the maximum
// absolute component over the M vector elements by a per-element max_fabs
// followed by an fmaxf reduction into c[0].  The declaration of c[], the
// store into norm[i], and the return value were dropped by the extraction --
// do not infer them from here.
296 template <
typename FloatN,
int M>
300 for (
int j=0; j<M; j++) c[j] =
max_fabs(x[j]);
302 for (
int j=1; j<M; j++) c[0] = fmaxf(c[j],c[0]);
// Number of scalar components in RegType (total size over component size).
308 #define REG_LENGTH (sizeof(RegType) / sizeof(((RegType*)0)->x))
// True when the component type of `type` is short (2-byte storage format).
311 #define IS_SHORT(type) (sizeof( ((type*)0)->x ) == sizeof(short) )
// Fragmented template header and constructors of the Spinor accessor, which
// bundles a storage pointer, a Texture, a float norm array, and the field
// stride.  N appears in the stride and load-width computations; `write` and
// tex_id presumably select write-back / texture-reference behavior elsewhere
// -- TODO confirm.  The class header line itself was dropped by the
// extraction.
321 template <
typename RegType,
typename InterType,
typename StoreType,
int N,
int write,
int tex_id=-1>
326 #ifdef USE_TEXTURE_OBJECTS // texture objects
// Default constructor: all pointers null, stride zero.
336 : spinor(0), tex(), norm(0), stride(0) { }
// Construct from a cudaColorSpinorField: capture data, texture, and norm
// pointers, and derive the stride from the field length; validates types.
339 : spinor((StoreType*)x.
V()), tex(&x), norm((float*)x.
Norm()),
340 stride(x.Length()/(N*
REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }
// Copy constructor: member-wise copy, no revalidation.
343 : spinor(st.spinor), tex(st.tex), norm(st.norm), stride(st.stride) { }
// Raw-pointer constructor; validates types.
345 Spinor(StoreType* spinor,
float* norm,
int stride)
346 : spinor(spinor), norm(norm), stride(stride) { checkTypes<RegType, InterType, StoreType>(); }
// set(): rebind this Spinor to another field's data and norm pointers, then
// revalidate the type triple.  Fragment: the USE_TEXTURE_OBJECTS branch body
// and the closing brace were dropped by the extraction.
358 void set(
const cudaColorSpinorField &x){
359 spinor = (StoreType*)x.V();
360 #ifdef USE_TEXTURE_OBJECTS
365 norm = (
float*)x.Norm();
368 checkTypes<RegType,InterType,StoreType>();
// Device-side load: gathers M InterType words at stride intervals (via the
// texture or direct spinor[] access) into y[], then converts them into N
// RegType values in x[].  M = N*sizeof(RegType)/sizeof(InterType).
// Fragment: the declarations of y[] and xN, the branch conditions, and the
// closing braces were dropped by the extraction.
373 __device__
inline void load(RegType x[],
const int i) {
375 const int M = (N *
sizeof(RegType)) /
sizeof(InterType);
379 #ifndef USE_TEXTURE_OBJECTS
// Texture gather scaled by xN (presumably the per-site norm factor for
// short storage -- TODO confirm; xN's declaration was dropped).
386 for (
int j=0; j<M; j++) y[j] = xN*tex[i + j*stride];
// Unscaled texture gather.
389 for (
int j=0; j<M; j++)
copyFloatN(y[j], tex[i + j*stride]);
391 #ifndef USE_TEXTURE_OBJECTS
397 for (
int j=0; j<M; j++) {
// Direct global-memory gather.
403 for (
int j=0; j<M; j++)
copyFloatN(y[j],spinor[i + j*stride]);
// Convert intermediate y[] into the register representation x[].
409 convert<RegType, InterType>(
x,
y, N);
// Device-side save: converts x[] (RegType) into y[] (InterType), then
// scatters to global memory at stride intervals.  One branch computes a
// per-site factor C via store_norm and scales y[] by it on the way out
// (presumably the short-storage path -- TODO confirm); the other writes y[]
// unscaled.  Fragment: y[]'s declaration, the branch conditions, and the
// closing braces were dropped by the extraction.
413 __device__
inline void save(RegType x[],
int i) {
415 const int M = (N *
sizeof(RegType)) /
sizeof(InterType);
417 convert<InterType, RegType>(
y,
x, M);
420 float C = store_norm<InterType, M>(norm,
y, i);
422 for (
int j=0; j<M; j++)
copyFloatN(spinor[i+j*stride], C*y[j]);
425 for (
int j=0; j<M; j++)
copyFloatN(spinor[i+j*stride], y[j]);
// Host-side backup: allocates host buffers (ownership passes to the caller
// via spinor_h / norm_h) and copies the device spinor -- and the norm array
// when norm_bytes > 0 -- back to them.  Fragment: the closing braces were
// dropped by the extraction.  NOTE(review): the cudaMemcpy return codes are
// not checked.
431 void save(
char **spinor_h,
char **norm_h,
size_t bytes,
size_t norm_bytes) {
433 *spinor_h =
new char[bytes];
434 cudaMemcpy(*spinor_h, spinor, bytes, cudaMemcpyDeviceToHost);
435 if (norm_bytes > 0) {
436 *norm_h =
new char[norm_bytes];
437 cudaMemcpy(*norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
// Host-side restore: copies the backed-up host buffers produced by save()
// back to the device (norm only when norm_bytes > 0).  Fragment: the tail --
// presumably the delete[] of the host buffers and the closing braces -- was
// dropped by the extraction.  NOTE(review): cudaMemcpy return codes are not
// checked.
444 void load(
char **spinor_h,
char **norm_h,
size_t bytes,
size_t norm_bytes) {
446 cudaMemcpy(spinor, *spinor_h, bytes, cudaMemcpyHostToDevice);
447 if (norm_bytes > 0) {
448 cudaMemcpy(norm, *norm_h, norm_bytes, cudaMemcpyHostToDevice);
// Accessor returning the raw device data pointer, followed by a dangling
// error fragment -- presumably the fall-through case of Precision() (per the
// cross-reference listing at the end of this chunk); its surrounding
// if/else-if chain was dropped by the extraction.
458 void*
V() {
return (
void*)spinor; }
466 else errorQuda(
"Unknown precision type\n");
477 #ifndef USE_TEXTURE_OBJECTS
enum QudaPrecision_s QudaPrecision
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Spinor(StoreType *spinor, float *norm, int stride)
void load(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes)
Texture(const cudaColorSpinorField *x)
void set(const cudaColorSpinorField &x)
cpuColorSpinorField * spinor
__inline__ __device__ double fetch_double(texture< int2, 1 > t, int i)
__device__ void load(RegType x[], const int i)
__device__ void copyFloatN(FloatN &a, const FloatN &b)
QudaPrecision Precision()
Spinor(const cudaColorSpinorField &x)
__device__ float store_norm(float *norm, FloatN x[M], int i)
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
Texture(const Texture &tex)
__device__ void save(RegType x[], int i)
cpuColorSpinorField * out
void save(char **spinor_h, char **norm_h, size_t bytes, size_t norm_bytes)
__forceinline__ __host__ __device__ float max_fabs(const float4 &c)
Texture & operator=(const Texture &tex)
__device__ OutputType operator[](unsigned int idx)
Spinor & operator=(const Spinor &src)
void setStride(int stride_)
__device__ OutputType fetch(unsigned int idx)
bool tex_id_table[MAX_TEX_ID]
void bind(const InputType *, size_t bytes)