1 #ifndef _CLOVER_ORDER_H 2 #define _CLOVER_ORDER_H 11 #if __COMPUTE_CAPABILITY__ >= 300 12 #include <trove/ptr.h> 36 template <
typename Float,
typename T>
63 template <
typename T,
int N>
66 a.field.load((T*)data,
a.x_cb,
a.parity,
a.chirality);
69 template <
typename T,
int N>
72 a.field.load((T*)data,
a.x_cb,
a.parity,
a.chirality);
161 template<
typename Float,
int nColor,
int nSpin, QudaCloverFieldOrder order>
struct Accessor {
164 errorQuda(
"Not implemented for order %d", order);
168 int c_row,
int c_col)
const {
177 return (j*stride+
x)*N +
i;
180 template<
typename Float,
int nColor,
int nSpin>
185 static constexpr
int N = nSpin *
nColor / 2;
187 :
a(static_cast<Float*>(const_cast<void*>(A.
V(inverse)))), stride(A.Stride()),
offset(A.Bytes()/(2*sizeof(Float))) { }
189 __device__ __host__
inline complex<Float>
operator()(
int parity,
int x,
int s_row,
int s_col,
int c_row,
int c_col)
const {
191 if (s_col / 2 != s_row / 2) {
return complex<Float>(0.0); }
193 const int chirality = s_col / 2;
195 int row = s_row%2 *
nColor + c_row;
196 int col = s_col%2 *
nColor + c_col;
200 return 2*a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(row, stride,
x) ];
201 }
else if (col < row) {
203 int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
206 return 2*complex<Float>(a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(
idx+0,stride,
x) ],
207 a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(
idx+1,stride,
x) ]);
211 int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
214 return 2*complex<Float>( a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(
idx+0,stride,
x) ],
215 -a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(
idx+1,stride,
x) ]);
222 template<
typename Float,
int nColor,
int nSpin>
229 a[0] =
static_cast<Float*
>(
const_cast<void*
>(A.
V(inverse)));
231 a[1] =
static_cast<Float*
>(
const_cast<void*
>(A.
V(inverse))) + A.
Bytes()/(2*
sizeof(Float));
232 zero = complex<Float>(0.0,0.0);
235 __device__ __host__
inline complex<Float>
operator()(
int parity,
int x,
int s_row,
int s_col,
int c_row,
int c_col)
const {
237 if (s_col / 2 != s_row / 2) {
return zero; }
239 const int chirality = s_col / 2;
241 unsigned int row = s_row%2 *
nColor + c_row;
242 unsigned int col = s_col%2 *
nColor + c_col;
245 complex<Float>
tmp =
a[
parity][(
x*2 + chirality)*N*N + row];
247 }
else if (col < row) {
249 int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
250 int idx = (
x*2 + chirality)*N*N + N + 2*k;
254 int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
255 int idx = (
x*2 + chirality)*N*N + N + 2*k;
267 template <
typename Float,
int nColor,
int nSpin, QudaCloverFieldOrder order>
299 int s_col,
int c_row,
int c_col)
const {
318 int s_col,
int c_row,
int c_col)
const {
351 constexpr
int n = (nSpin *
nColor) / 2;
352 constexpr
int chiral_block =
n *
n / 2;
353 return static_cast<size_t>(
volumeCB) * chiral_block * 2ll * 2ll *
sizeof(Float);
366 template <
typename Float,
int length,
int N,
bool huge_alloc=false>
377 #ifdef USE_TEXTURE_OBJECTS 379 cudaTextureObject_t
tex;
380 cudaTextureObject_t normTex;
381 const int tex_offset;
396 #ifdef USE_TEXTURE_OBJECTS
397 tex(0), normTex(0), tex_offset(
offset/N),
403 this->clover = clover_ ? clover_ : (Float*)(
clover.V(is_inverse));
404 this->norm = norm_ ? norm_ : (
float*)(
clover.Norm(is_inverse));
405 #ifdef USE_TEXTURE_OBJECTS 414 if (!huge_alloc && (this->clover !=
clover.V(is_inverse) ||
416 errorQuda(
"Cannot use texture read since data pointer does not equal field pointer - use with huge_alloc=true instead");
465 for (
int i=0;
i<
M;
i++) {
467 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__) 469 TexVector vecTmp = tex1Dfetch<TexVector>(
tex,
parity*tex_offset +
stride*(chirality*
M+
i) +
x);
472 for (
int j=0; j<N; j++) copy(v[i*N+j], reinterpret_cast<RegType*>(&vecTmp)[j]);
479 for (
int j=0; j<N; j++) copy(v[i*N+j], reinterpret_cast<Float*>(&vecTmp)[j]);
483 if (
sizeof(Float)==
sizeof(short)) {
484 #if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__) 506 if (
sizeof(Float)==
sizeof(
short)) {
508 for (
int i=0; i<block; i++) scale = fabs(v[i]) > scale ?
fabs(v[
i]) : scale;
513 for (
int i=0;
i<
M;
i++) {
516 if (
sizeof(Float)==
sizeof(
short))
518 for (
int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[
i*N+j] / scale);
521 for (
int j=0; j<N; j++) copy(reinterpret_cast<Float*>(&vecTmp)[j], v[
i*N+j]);
537 for (
int chirality=0; chirality<2; chirality++)
load(&v[chirality*36],
x,
parity, chirality);
549 for (
int chirality=0; chirality<2; chirality++)
save(&v[chirality*36],
x,
parity, chirality);
583 if (
sizeof(Float)==
sizeof(short))
bytes += 2*
sizeof(
float);
594 template <
typename real,
int length>
struct S { real
v[
length]; };
599 template <
typename Float,
int length>
613 this->clover = clover_ ? clover_ : (Float*)(
clover.V(inverse));
621 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE) 623 trove::coalesced_ptr<structure> clover_((structure*)
clover);
632 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE) 634 trove::coalesced_ptr<structure> clover_((structure*)
clover);
636 for (
int i=0;
i<
length;
i++) v_.v[
i] = 2.0*(Float)v[
i];
649 template <
typename Float,
int length>
662 offdiag = clover_ ? ((Float**)clover_)[0] : ((Float**)
clover.V(inverse))[0];
663 diag = clover_ ? ((Float**)clover_)[1] : ((Float**)
clover.V(inverse))[1];
671 for (
int chirality=0; chirality<2; chirality++) {
673 for (
int i=0;
i<6;
i++) {
678 for (
int i=0;
i<30;
i++) {
681 const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
690 for (
int chirality=0; chirality<2; chirality++) {
692 for (
int i=0;
i<6;
i++) {
697 for (
int i=0;
i<30;
i++) {
700 const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
716 template <
typename Float,
int length>
728 this->clover[0] = clover_ ? clover_ : (Float*)(
clover.V(inverse));
729 this->clover[1] = (Float*)((
char*)this->clover[0] +
clover.Bytes()/2);
742 int bq[36] = { 21, 32, 33, 0, 1, 20,
743 28, 29, 30, 31, 6, 7, 14, 15, 22, 23,
744 34, 35, 8, 9, 16, 17, 24, 25,
745 10, 11, 18, 19, 26, 27,
751 for (
int i=0;
i<6;
i++)
sign[
i] = 1;
752 for (
int i=6;
i<36;
i+=2) {
753 if ( (
i >= 10 &&
i<= 15) || (
i >= 18 &&
i <= 29) ) {
sign[
i] = -1;
sign[
i+1] = -1; }
758 for (
int chirality=0; chirality<2; chirality++)
759 for (
int i=0;
i<M;
i++)
788 #endif //_CLOVER_ORDER_H
Accessor(const CloverField &A, bool inverse=false)
AllocType< huge_alloc >::type AllocInt
void * V(bool inverse=false)
__device__ __host__ const complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Read-only complex-member accessor function.
__device__ __host__ int Ncolor() const
Complex-member accessor function.
mapper< Float >::type RegType
mapper< Float >::type RegType
cudaColorSpinorField * tmp
__device__ __host__ complex< Float > operator()(int dummy, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Read-only complex-member accessor function. This is a special variant that is compatible with the equ...
void load()
Restore the field from the host after tuning.
mapper< Float >::type RegType
const Accessor< Float, nColor, nSpin, order > accessor
clover::FloatNOrder< float, N, 4 > type
This is just a dummy structure we use for trove to define the required structure size.
__device__ __host__ void save(RegType v[length], int x, int parity)
FieldOrder(CloverField &A, bool inverse=false)
VectorType< Float, N >::type Vector
clover::FloatNOrder< double, N, 2 > type
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__device__ __host__ clover_wrapper< RegType, FloatNOrder< Float, length, N > > operator()(int x_cb, int parity, int chirality)
This accessor routine returns a clover_wrapper to this object, allowing us to overload various operat...
clover::FloatNOrder< short, N, 4 > type
__device__ __host__ int VolumeCB() const
__device__ __host__ void load(RegType v[block], int x, int parity, int chirality) const
Load accessor for a single chiral block.
void * backup_norm_h
host memory for backing up the field when tuning
__device__ __host__ HMatrix()
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
Accessor(const CloverField &A, bool inverse=false)
clover_wrapper is an internal class that is used to wrap instances of colorspinor accessors...
__device__ __host__ void save(const RegType v[length], int x, int parity)
__device__ __host__ int Volume() const
__device__ __host__ void load(RegType v[length], int x, int parity) const
Provides precision abstractions and defines the register precision given the storage precision using ...
__device__ __host__ void save(const RegType v[block], int x, int parity, int chirality)
Store accessor for a single chiral block.
Accessor(const CloverField &A, bool inverse=false)
#define safe_malloc(size)
__device__ __host__ void load(RegType v[length], int x, int parity) const
Load accessor for the clover matrix.
QDPOrder(const CloverField &clover, bool inverse, Float *clover_=0)
mapper< Float >::type RegType
__device__ __host__ void operator=(const C &a)
Assignment operator with H matrix instance as input.
void save()
Backup the field to the host when tuning.
__device__ __host__ void operator=(const HMatrix< U, N > &b)
__device__ __host__ void save(const RegType v[length], int x, int parity)
static __inline__ dim3 dim3 void size_t cudaStream_t int enum cudaTextureReadMode readMode static __inline__ const struct texture< T, dim, readMode > & tex
const AllocInt norm_offset
Accessor routine for CloverFields in native field order.
QDPJITOrder(const CloverField &clover, bool inverse, Float *clover_=0)
__device__ __host__ complex< Float > & operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__device__ __host__ const clover_wrapper< RegType, FloatNOrder< Float, length, N > > operator()(int x_cb, int parity, int chirality) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
BQCDOrder(const CloverField &clover, bool inverse, Float *clover_=0)
__device__ __host__ void load(RegType v[length], int x, int parity) const
__device__ __host__ void zero(vector_type< scalar, n > &v)
__device__ __host__ int indexFloatN(int k, int stride, int x)
__device__ __host__ void load(RegType v[length], int x, int parity) const
FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, float *norm_=0, bool override=false)
host memory for backing up norm when tuning
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__device__ __host__ void save(const RegType v[length], int x, int parity)
Store accessor for the clover matrix.