10 #ifdef USE_TEXTURE_OBJECTS
12 template<
typename OutputType,
typename InputType>
16 #ifndef DIRECT_ACCESS_BLAS
17 cudaTextureObject_t
spinor;
24 #ifndef DIRECT_ACCESS_BLAS
27 Texture(
const cudaColorSpinorField *
x) :
spinor((InputType*)(x->
V())) { }
33 if (
this != &tex)
spinor = tex.spinor;
37 #ifndef DIRECT_ACCESS_BLAS
38 __device__
inline OutputType
fetch(
unsigned int idx)
39 {
return tex1Dfetch<OutputType>(
spinor,
idx); }
41 __device__
inline OutputType
fetch(
unsigned int idx)
48 #ifndef DIRECT_ACCESS_BLAS
50 {
return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); }
63 #if (__COMPUTE_CAPABILITY__ >= 130)
64 __inline__ __device__ double2
fetch_double2(texture<int4, 1> t,
int i)
66 int4 v = tex1Dfetch(t,i);
67 return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
73 return make_double2(0.0, 0.0);
77 #define MAX_TEXELS (1<<27)
79 template<
typename OutputType,
typename InputType,
int tex_
id>
82 #ifdef DIRECT_ACCESS_BLAS
91 #ifdef DIRECT_ACCESS_BLAS
97 #ifdef DIRECT_ACCESS_BLAS
98 :
spinor((
const InputType*)x->V()), bytes(x->Bytes())
102 if (x->Bytes()) {
bind((
const InputType*)x->V(), x->Bytes()); bound =
true; }
107 #ifdef DIRECT_ACCESS_BLAS
108 :
spinor(tex.spinor), bytes(tex.bytes)
115 #ifdef DIRECT_ACCESS_BLAS
122 inline void bind(
const InputType*,
size_t bytes){ }
126 __device__
inline OutputType
fetch(
unsigned int idx) { OutputType
x; x.x =0;
return x; };
130 template<
typename OutputType,
typename InputType,
int tex_
id>
133 template<
typename OutputType,
typename InputType,
int tex_
id>
136 #define DECL_TEX(id) \
137 texture<short2,1,cudaReadModeNormalizedFloat> tex_short2_##id; \
138 texture<short4,1,cudaReadModeNormalizedFloat> tex_short4_##id; \
139 texture<float,1> tex_float_##id; \
140 texture<float2,1> tex_float2_##id; \
141 texture<float4,1> tex_float4_##id; \
142 texture<int4,1> tex_double2_##id;
145 #define DEF_BIND_UNBIND(outtype, intype, id) \
146 template<> inline void Texture<outtype,intype,id>::bind(const intype *ptr, size_t bytes) \
147 { cudaBindTexture(0,tex_##intype##_##id, ptr, bytes); } \
148 template<> inline void Texture<outtype,intype,id>::unbind() { cudaUnbindTexture(tex_##intype##_##id); }
151 #define DEF_FETCH_TEX(outtype, intype, id) \
152 template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
153 { return tex1Dfetch(tex_##intype##_##id,idx); }
156 #define DEF_FETCH_DIRECT(outtype, intype, id) \
157 template<> __device__ inline outtype Texture<outtype,intype,id>::fetch(unsigned int idx) \
158 { outtype out; copyFloatN(out, spinor[idx]); return out; }
161 #if defined(DIRECT_ACCESS_BLAS)
162 #define DEF_FETCH DEF_FETCH_DIRECT
164 #define DEF_FETCH DEF_FETCH_TEX
168 #if defined(DIRECT_ACCESS_BLAS) || defined(FERMI_NO_DBLE_TEX)
169 #define DEF_FETCH_DBLE DEF_FETCH_DIRECT
171 #define DEF_FETCH_DBLE(outtype, intype, id) \
172 template<> __device__ inline outtype Texture<outtype,double2,id>::fetch(unsigned int idx) \
173 { outtype out; copyFloatN(out, fetch_double2(tex_double2_##id,idx)); return out; }
177 #define DEF_BIND_UNBIND_FETCH(outtype, intype, id) \
178 DEF_BIND_UNBIND(outtype, intype, id) \
179 DEF_FETCH(outtype, intype, id)
182 #define DEF_ALL(id) \
184 DEF_BIND_UNBIND_FETCH(float2, short2, id) \
185 DEF_BIND_UNBIND_FETCH(float4, short4, id) \
186 DEF_BIND_UNBIND_FETCH(float, float, id) \
187 DEF_BIND_UNBIND_FETCH(float2, float2, id) \
188 DEF_BIND_UNBIND_FETCH(float4, float4, id) \
189 DEF_BIND_UNBIND(double2, double2, id) \
190 DEF_BIND_UNBIND(float2, double2, id) \
191 DEF_FETCH_DBLE(double2, double2, id) \
192 DEF_FETCH_DBLE(float2, double2, id)
206 #undef DEF_BIND_UNBIND
207 #undef DEF_FETCH_DIRECT
210 #undef DEF_FETCH_DBLE
211 #undef DEF_BIND_UNBIND_FETCH
214 #endif // USE_TEXTURE_OBJECTS
228 template <
typename RegType,
typename InterType,
typename StoreType>
231 const size_t reg_size =
sizeof(((RegType*)0)->x);
232 const size_t inter_size =
sizeof(((InterType*)0)->x);
233 const size_t store_size =
sizeof(((StoreType*)0)->x);
235 if (reg_size != inter_size && store_size != 2 && inter_size != 4)
236 errorQuda(
"Precision of register (%lu) and intermediate (%lu) types must match\n",
237 (
unsigned long)reg_size, (
unsigned long)inter_size);
239 if (vecLength<InterType>() != vecLength<StoreType>()) {
240 errorQuda(
"Vector lengths intermediate and register types must match\n");
243 if (vecLength<RegType>() == 0)
errorQuda(
"Vector type not supported\n");
244 if (vecLength<InterType>() == 0)
errorQuda(
"Vector type not supported\n");
245 if (vecLength<StoreType>() == 0)
errorQuda(
"Vector type not supported\n");
249 template <
typename FloatN,
int M>
253 for (
int j=0; j<M; j++) c[j] =
max_fabs(x[j]);
255 for (
int j=1; j<M; j++) c[0] = fmaxf(c[j],c[0]);
261 #define REG_LENGTH (sizeof(RegType) / sizeof(((RegType*)0)->x))
264 #define IS_SHORT(type) (sizeof( ((type*)0)->x ) == sizeof(short) )
274 template <
typename RegType,
typename InterType,
typename StoreType,
int N,
int write,
int tex_id=-1>
279 #ifdef USE_TEXTURE_OBJECTS // texture objects
289 : spinor(0), tex(), norm(0), stride(0) { }
292 : spinor((StoreType*)x.
V()), tex(&x), norm((float*)x.
Norm()),
293 stride(x.Length()/(N*
REG_LENGTH)) { checkTypes<RegType,InterType,StoreType>(); }
296 : spinor(st.spinor), tex(st.tex), norm(st.norm), stride(st.stride) { }
298 Spinor(StoreType* spinor,
float* norm,
int stride)
299 : spinor(spinor), norm(norm), stride(stride) { checkTypes<RegType, InterType, StoreType>(); }
313 __device__
inline void load(RegType x[],
const int i) {
315 const int M = (N *
sizeof(RegType)) /
sizeof(InterType);
319 #ifndef USE_TEXTURE_OBJECTS
326 for (
int j=0; j<M; j++) y[j] = xN*tex[i + j*stride];
329 for (
int j=0; j<M; j++)
copyFloatN(y[j], tex[i + j*stride]);
331 #ifndef USE_TEXTURE_OBJECTS
337 for (
int j=0; j<M; j++) {
343 for (
int j=0; j<M; j++)
copyFloatN(y[j],spinor[i + j*stride]);
349 convert<RegType, InterType>(
x, y, N);
353 __device__
inline void save(RegType x[],
int i) {
355 const int M = (N *
sizeof(RegType)) /
sizeof(InterType);
357 convert<InterType, RegType>(y,
x, M);
360 float C = store_norm<InterType, M>(norm, y, i);
362 for (
int j=0; j<M; j++)
copyFloatN(spinor[i+j*stride], C*y[j]);
365 for (
int j=0; j<M; j++)
copyFloatN(spinor[i+j*stride], y[j]);
371 void save(
char **spinor_h,
char **norm_h,
size_t bytes,
size_t norm_bytes) {
373 *spinor_h =
new char[bytes];
374 cudaMemcpy(*spinor_h, spinor, bytes, cudaMemcpyDeviceToHost);
375 if (norm_bytes > 0) {
376 *norm_h =
new char[norm_bytes];
377 cudaMemcpy(*norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
384 void load(
char **spinor_h,
char **norm_h,
size_t bytes,
size_t norm_bytes) {
386 cudaMemcpy(spinor, *spinor_h, bytes, cudaMemcpyHostToDevice);
387 if (norm_bytes > 0) {
388 cudaMemcpy(norm, *norm_h, norm_bytes, cudaMemcpyHostToDevice);
398 void*
V() {
return (
void*)spinor; }
406 else errorQuda(
"Unknown precision type\n");
415 #ifndef USE_TEXTURE_OBJECTS