#ifndef _CLOVER_ORDER_H
#define _CLOVER_ORDER_H

  /** clover_wrapper is an internal class used to wrap instances of clover
      accessors, currying in a specific site, parity and chirality */
  template <typename Float, typename T>
    struct clover_wrapper {
      // ... (members and constructor elided in this excerpt) ...

      /** Assignment operator with an HMatrix instance as input */
      template <typename C>
      __device__ __host__ inline void operator=(const C &a) {
        field.save(a.data, x_cb, parity, chirality);
      }
    };
  template <typename T, int N>
    template <typename S>
    __device__ __host__ inline void HMatrix<T,N>::operator=(const clover_wrapper<T,S> &a) {
    a.field.load(data, a.x_cb, a.parity, a.chirality);
  }

  template <typename T, int N>
    template <typename S>
    __device__ __host__ inline HMatrix<T,N>::HMatrix(const clover_wrapper<T,S> &a) {
    a.field.load(data, a.x_cb, a.parity, a.chirality);
  }
  /** Helper functor for returning the squared magnitude of a complex number */
  template<typename ReduceType, typename Float> struct square_ {
    __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
    { return static_cast<ReduceType>(norm(x)); }
  };
  /** Helper functor for returning the absolute value of a complex number */
  template<typename ReduceType, typename Float> struct abs_ {
    __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
    { return static_cast<ReduceType>(abs(x)); }
  };
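  // ------------------------------------------------------------------------
  // Illustrative aside (not part of the original header): square_ and abs_ are
  // meant to be passed as the "helper" of a transform_reduce, which maps every
  // complex element and folds the results with a reducer.  The host-only sketch
  // below shows the same pattern using only the C++17 standard library;
  // std::complex stands in for quda::complex and all names here are hypothetical.
  #include <complex>
  #include <functional>
  #include <numeric>
  #include <vector>

  inline double l2_norm_squared(const std::vector<std::complex<double>> &v) {
    // square_-like helper |z|^2 folded with plus -> squared L2 norm
    return std::transform_reduce(v.begin(), v.end(), 0.0, std::plus<double>(),
                                 [](const std::complex<double> &z) { return std::norm(z); });
  }

  inline double l1_norm(const std::vector<std::complex<double>> &v) {
    // abs_-like helper |z| folded with plus -> L1 norm
    return std::transform_reduce(v.begin(), v.end(), 0.0, std::plus<double>(),
                                 [](const std::complex<double> &z) { return std::abs(z); });
  }
  // ------------------------------------------------------------------------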
  /** The default Accessor is the catch-all for clover field orders without a specialization */
  template<typename Float, int nColor, int nSpin, QudaCloverFieldOrder order>
    struct Accessor {
    mutable complex<Float> dummy;

    Accessor(const CloverField &A, bool inverse=false) {
      errorQuda("Not implemented for order %d", order);
    }

    __device__ __host__ inline complex<Float>& operator()(int parity, int x, int s_row, int s_col,
                                                           int c_row, int c_col) const {
      return dummy;
    }

    template<typename helper, typename reducer>
    __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double init) const { return 0.0; }
  };
  /** Accessor specialization for clover fields in FLOAT2 order */
  template<typename Float, int nColor, int nSpin>
    struct Accessor<Float, nColor, nSpin, QUDA_FLOAT2_CLOVER_ORDER> {
    Float *a;
    int stride;
    size_t offset_cb;
    static constexpr int N = nSpin * nColor / 2;

    Accessor(const CloverField &A, bool inverse=false)
      : a(static_cast<Float*>(const_cast<void*>(A.V(inverse)))), stride(A.Stride()),
        offset_cb(A.Bytes()/(2*sizeof(Float))) { }
    __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col,
                                                          int c_row, int c_col) const {
      // the clover matrix is block diagonal in chirality, so off-block elements are zero
      if (s_col / 2 != s_row / 2) { return complex<Float>(0.0); }
      const int chirality = s_col / 2;

      int row = s_row%2 * nColor + c_row;
      int col = s_col%2 * nColor + c_col;
      Float *a_ = a + parity*offset_cb + stride*chirality*N*N;

      if (row == col) {
        // diagonal elements are stored as N real numbers per chiral block
        return 2*a_[ row*stride + x ];
      } else if (col < row) {
        // switch coordinates to count from bottom right instead of top left of matrix
        int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
        complex<Float> *off = reinterpret_cast<complex<Float>*>(a_ + N*stride);
        return 2*off[k*stride + x];
      } else {
        // upper triangle requested, so return the conjugate of the stored lower-triangular element
        int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
        complex<Float> *off = reinterpret_cast<complex<Float>*>(a_ + N*stride);
        return 2*conj(off[k*stride + x]);
      }
    }
    template<typename helper, typename reducer>
    __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double init) const {
      double result = init;
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        thrust_allocator alloc;
        thrust::device_ptr<complex<Float> > ptr(reinterpret_cast<complex<Float>*>(a));
        result = thrust::transform_reduce(thrust::cuda::par(alloc), ptr, ptr+offset_cb, h, result, r);
      } else {
        complex<Float> *ptr = reinterpret_cast<complex<Float>*>(a);
        result = thrust::transform_reduce(thrust::seq, ptr, ptr+offset_cb, h, result, r);
      }
      return result;
    }
  };
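  // ------------------------------------------------------------------------
  // Illustrative aside (not part of the original header): only the strict lower
  // triangle of each N x N Hermitian chiral block is stored, and the accessor
  // above packs it column by column with
  //   k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1.
  // The self-contained check below (hypothetical helper names) verifies that,
  // for N = 6, this formula hits every k in [0, N*(N-1)/2) exactly once.
  #include <cassert>
  #include <set>

  constexpr int lower_triangle_index(int row, int col, int N) {
    return N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
  }

  inline void check_lower_triangle_packing() {
    constexpr int N = 6;  // clover chiral block size
    std::set<int> seen;
    for (int col = 0; col < N; col++)
      for (int row = col + 1; row < N; row++) {
        int k = lower_triangle_index(row, col, N);
        assert(k >= 0 && k < N*(N-1)/2);
        seen.insert(k);
      }
    assert(static_cast<int>(seen.size()) == N*(N-1)/2);  // 15 distinct indices
  }
  // ------------------------------------------------------------------------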
  /** Map the linear index k of an element within a chiral block onto its
      FloatN (float2/float4) strided memory offset for site x */
  template <int N>
    __device__ __host__ inline int indexFloatN(int k, int stride, int x) {
    int j = k / N;  // which short vector
    int i = k % N;  // which component within it
    return (j*stride+x)*N + i;
  };
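  // ------------------------------------------------------------------------
  // Illustrative aside (not part of the original header): indexFloatN places
  // element k of a site into short vector j = k/N (component i = k%N), and
  // vector j of site x starts at float offset (j*stride + x)*N.  The sketch
  // below (hypothetical names and values) checks the FLOAT4 case: the first
  // four elements of a site share one float4, and element 4 is one stride away.
  #include <cassert>

  template <int N> constexpr int index_floatN(int k, int stride, int x) {
    return (k / N * stride + x) * N + k % N;
  }

  inline void check_float4_indexing() {
    constexpr int N = 4;     // float4 ordering
    const int stride = 256;  // hypothetical padded stride
    const int x = 7;         // hypothetical checkerboard site
    for (int k = 0; k < N; k++) assert(index_floatN<N>(k, stride, x) == x*N + k);
    assert(index_floatN<N>(N, stride, x) == (stride + x)*N);
  }
  // ------------------------------------------------------------------------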
  /** Accessor specialization for clover fields in FLOAT4 order */
  template<typename Float, int nColor, int nSpin>
    struct Accessor<Float, nColor, nSpin, QUDA_FLOAT4_CLOVER_ORDER> {
    Float *a;
    int stride;
    size_t offset_cb;
    static constexpr int N = nSpin * nColor / 2;

    Accessor(const CloverField &A, bool inverse=false)
      : a(static_cast<Float*>(const_cast<void*>(A.V(inverse)))), stride(A.Stride()),
        offset_cb(A.Bytes()/(2*sizeof(Float))) { }
    __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col,
                                                          int c_row, int c_col) const {
      // the clover matrix is block diagonal in chirality, so off-block elements are zero
      if (s_col / 2 != s_row / 2) { return complex<Float>(0.0); }
      const int chirality = s_col / 2;

      int row = s_row%2 * nColor + c_row;
      int col = s_col%2 * nColor + c_col;
      Float *a_ = a + parity*offset_cb + stride*chirality*N*N;

      if (row == col) {
        // diagonal elements are stored first as N real numbers
        return 2*a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(row, stride, x) ];
      } else if (col < row) {
        // switch coordinates to count from bottom right instead of top left of matrix
        int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
        int idx = N + 2*k;  // real/imag pair k follows the N diagonal elements
        return 2*complex<Float>(a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0, stride, x) ],
                                a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1, stride, x) ]);
      } else {
        // upper triangle requested, so return the conjugate of the stored lower-triangular element
        int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
        int idx = N + 2*k;
        return 2*complex<Float>( a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0, stride, x) ],
                                -a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1, stride, x) ]);
      }
    }
    template<typename helper, typename reducer>
    __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double init) const {
      double result = init;
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        thrust_allocator alloc;
        thrust::device_ptr<complex<Float> > ptr(reinterpret_cast<complex<Float>*>(a));
        result = thrust::transform_reduce(thrust::cuda::par(alloc), ptr, ptr+offset_cb, h, result, r);
      } else {
        complex<Float> *ptr = reinterpret_cast<complex<Float>*>(a);
        result = thrust::transform_reduce(thrust::seq, ptr, ptr+offset_cb, h, result, r);
      }
      return result;
    }
  };
  /** Accessor specialization for clover fields in packed (QDP) order */
  template<typename Float, int nColor, int nSpin>
    struct Accessor<Float, nColor, nSpin, QUDA_PACKED_CLOVER_ORDER> {
    Float *a[2];
    static constexpr int N = nSpin * nColor / 2;
    complex<Float> zero;

    Accessor(const CloverField &A, bool inverse=false) {
      // pointers to the even- and odd-parity halves of the field
      a[0] = static_cast<Float*>(const_cast<void*>(A.V(inverse)));
      a[1] = static_cast<Float*>(const_cast<void*>(A.V(inverse))) + A.Bytes()/(2*sizeof(Float));
      zero = complex<Float>(0.0, 0.0);
    }
    __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row, int s_col,
                                                          int c_row, int c_col) const {
      // the clover matrix is block diagonal in chirality, so off-block elements are zero
      if (s_col / 2 != s_row / 2) { return zero; }
      const int chirality = s_col / 2;

      unsigned int row = s_row%2 * nColor + c_row;
      unsigned int col = s_col%2 * nColor + c_col;

      if (row == col) {
        // real diagonal element: each chiral block of site x occupies N*N consecutive reals
        return complex<Float>(a[parity][(x*2 + chirality)*N*N + row]);
      } else if (col < row) {
        // switch coordinates to count from bottom right instead of top left of matrix
        int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
        int idx = (x*2 + chirality)*N*N + N + 2*k;
        return complex<Float>(a[parity][idx], a[parity][idx+1]);
      } else {
        // upper triangle requested, so return the conjugate of the stored lower-triangular element
        int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
        int idx = (x*2 + chirality)*N*N + N + 2*k;
        return complex<Float>(a[parity][idx], -a[parity][idx+1]);
      }
    }
    template <typename helper, typename reducer>
    __host__ double transform_reduce(QudaFieldLocation location, helper h, reducer r, double init) const
    { /* ... host-side reduction over the packed field, elided in this excerpt ... */ }
  };
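  // ------------------------------------------------------------------------
  // Illustrative aside (not part of the original header): in the packed order
  // each site holds two chiral blocks of N*N = 36 consecutive reals, laid out
  // as N real diagonal entries followed by N*(N-1)/2 complex off-diagonals.
  // The hypothetical helpers below restate the indexing used by the accessor
  // above and check that the layout tiles the site exactly.
  #include <cassert>

  constexpr int packed_N = 6;
  constexpr int packed_base(int x, int chirality) { return (x*2 + chirality) * packed_N * packed_N; }
  constexpr int packed_diag(int x, int chirality, int row) { return packed_base(x, chirality) + row; }
  constexpr int packed_offdiag_re(int x, int chirality, int k) { return packed_base(x, chirality) + packed_N + 2*k; }

  inline void check_packed_layout() {
    // 6 real diagonal entries + 15 complex off-diagonals = 36 reals per chiral block
    static_assert(packed_N + 2*(packed_N*(packed_N-1)/2) == packed_N*packed_N, "block is 36 reals");
    // the last off-diagonal of chirality 0 is immediately followed by the diagonal of chirality 1
    assert(packed_offdiag_re(0, 0, 14) + 2 == packed_diag(0, 1, 0));
  }
  // ------------------------------------------------------------------------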
  /** FieldOrder: order-agnostic interface to a clover field, built on the Accessor above */
  template <typename Float, int nColor, int nSpin, QudaCloverFieldOrder order>
    struct FieldOrder {

    /** An internal reference to the actual field we are accessing */
    CloverField &A;
    const int volumeCB;
    const Accessor<Float, nColor, nSpin, order> accessor;
    bool inverse;
    const QudaFieldLocation location;

    FieldOrder(CloverField &A, bool inverse=false)
      : A(A), volumeCB(A.VolumeCB()), accessor(A,inverse), inverse(inverse), location(A.Location())
    { }

    /** Read-only complex-member accessor function */
    __device__ __host__ inline complex<Float> operator()(int parity, int x, int s_row,
                                                          int s_col, int c_row, int c_col) const {
      return accessor(parity, x, s_row, s_col, c_row, c_col);
    }
    /** Read-only accessor variant that is signature-compatible with the equivalent
        gauge::FieldOrder accessor, so the two can be used interchangeably in templated code */
    __device__ __host__ inline complex<Float> operator()(int dummy, int parity, int x, int s_row,
                                                          int s_col, int c_row, int c_col) const {
      return accessor(parity, x, s_row, s_col, c_row, c_col);
    }
    /** Return the size of the allocation in bytes */
    size_t Bytes() const {
      constexpr int n = (nSpin * nColor) / 2;
      constexpr int chiral_block = n * n / 2;
      // the factors of 2 count the complex components and the two chiralities
      return static_cast<size_t>(volumeCB) * chiral_block * 2ll * 2ll * sizeof(Float);
    }
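    // Worked example (not part of the original header): for Wilson clover,
    // nSpin = 4 and nColor = 3, so n = 6 and chiral_block = 18; the two factors
    // of 2 count the complex components and the two chiralities, giving the
    // familiar 72 reals per site, i.e. Bytes() == volumeCB * 72 * sizeof(Float).
    static_assert((4 * 3) / 2 == 6, "n for Wilson clover");
    static_assert(6 * 6 / 2 == 18, "chiral_block for Wilson clover");
    static_assert(18 * 2 * 2 == 72, "reals per site for Wilson clover");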
    /** Returns the L1 norm of the field */
    __host__ double norm1(int dim=-1, bool global=true) const {
      double nrm1 = accessor.transform_reduce(location, abs_<double,Float>(),
                                              thrust::plus<double>(), 0.0);
      if (global) comm_allreduce(&nrm1);
      return nrm1;
    }
    /** Returns the L2 norm squared of the field */
    __host__ double norm2(int dim=-1, bool global=true) const {
      double nrm2 = accessor.transform_reduce(location, square_<double,Float>(),
                                              thrust::plus<double>(), 0.0);
      if (global) comm_allreduce(&nrm2);
      return nrm2;
    }
    /** Returns the Linfinity norm (maximum absolute value) of the field */
    __host__ double abs_max(int dim=-1, bool global=true) const {
      double absmax = accessor.transform_reduce(location, abs_<Float,Float>(),
                                                thrust::maximum<Float>(), 0.0);
      if (global) comm_allreduce_max(&absmax);
      return absmax;
    }
    /** Returns the minimum absolute value of the field */
    __host__ double abs_min(int dim=-1, bool global=true) const {
      double absmin = accessor.transform_reduce(location, abs_<Float,Float>(),
                                                thrust::minimum<Float>(), std::numeric_limits<double>::max());
      if (global) comm_allreduce_min(&absmin);
      return absmin;
    }
  };
  /** Accessor routine for CloverFields in native (FloatN) field order.
      add_rho optionally adds the constant rho to the diagonal on load, and
      huge_alloc enables 64-bit pointer arithmetic for very large allocations. */
  template <typename Float, int length, int N, bool add_rho=false, bool huge_alloc=false>
    struct FloatNOrder {
    typedef typename mapper<Float>::type real;  // register (compute) precision
#ifdef USE_TEXTURE_OBJECTS
    cudaTextureObject_t tex;
    cudaTextureObject_t normTex;
    const int tex_offset;
#endif
    FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_ = 0,
                norm_type *norm_ = 0, bool override = false) :
      offset(clover.Bytes() / (2 * sizeof(Float))),
      norm_offset(clover.NormBytes() / (2 * sizeof(norm_type))),
#ifdef USE_TEXTURE_OBJECTS
      tex_offset(offset / N),
#endif
      volumeCB(clover.VolumeCB()),
      stride(clover.Stride()),
      twisted(clover.Twisted()),
      bytes(clover.Bytes()),
      norm_bytes(clover.NormBytes()),
      backup_h(nullptr),
      backup_norm_h(nullptr)
    {
      this->clover = clover_ ? clover_ : (Float *)(clover.V(is_inverse));
      this->norm = norm_ ? norm_ : (norm_type *)(clover.Norm(is_inverse));
#ifdef USE_TEXTURE_OBJECTS
      // texture reads are only valid if the accessor aliases the field's own allocation
      if (!huge_alloc && (this->clover != clover.V(is_inverse) /* || norm pointer check elided */) && !override)
        errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with huge_alloc=true instead");
#endif
    }
    /** Load accessor for a single chiral block */
    __device__ __host__ inline void load(real v[block], int x, int parity, int chirality) const
    {
      // fetch the norm (scale) of this chiral block when using fixed-point storage
      norm_type nrm;
#if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
      nrm = !huge_alloc ? tex1Dfetch_<float>(normTex, parity * norm_offset + chirality * stride + x) :
                          norm[parity * norm_offset + chirality * stride + x];
#else
      nrm = norm[parity * norm_offset + chirality * stride + x];
#endif

      for (int i=0; i<M; i++) {
#if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
        // texture path: fetch one short vector and copy it into the register array
        TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity*tex_offset + stride*(chirality*M+i) + x);
        for (int j = 0; j < N; j++) {
          copy(v[i * N + j], reinterpret_cast<real *>(&vecTmp)[j]);
        }
#else
        // global-memory path: vectorized load, then copy/scale into the register type
        Vector vecTmp = vector_load<Vector>(clover + parity*offset, x + stride*(chirality*M+i));
        for (int j = 0; j < N; j++) { copy_and_scale(v[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm); }
#endif
      }

      // optionally add the constant rho to the diagonal (Hasenbusch-style shift)
      if (add_rho) for (int i=0; i<6; i++) v[i] += rho;
    }
    /** Store accessor for a single chiral block */
    __device__ __host__ inline void save(const real v[block], int x, int parity, int chirality)
    {
      real tmp[block];
      if (isFixed<Float>::value) {
        // find the norm of the chiral block and rescale into the fixed-point range
        norm_type scale = 0.0;
        for (int i = 0; i < block; i++) scale = fabsf((norm_type)v[i]) > scale ? fabsf((norm_type)v[i]) : scale;
        norm[parity*norm_offset + chirality*stride + x] = scale;
        real scale_inv = fixedMaxValue<Float>::value / scale;
        for (int i = 0; i < block; i++) tmp[i] = v[i] * scale_inv;
      } else {
        for (int i = 0; i < block; i++) tmp[i] = v[i];
      }

      for (int i = 0; i < M; i++) {
        Vector vecTmp;
        // scalar copy converting into the storage type, then vectorized store to memory
        for (int j = 0; j < N; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
        vector_store(clover + parity*offset, x + stride*(chirality*M+i), vecTmp);
      }
    }
    /** Backup the field to the host when tuning */
    void save() {
      if (backup_h) errorQuda("Already allocated host backup");
      backup_h = safe_malloc(bytes);
      cudaMemcpy(backup_h, clover, bytes, cudaMemcpyDeviceToHost);
      if (norm_bytes) {
        backup_norm_h = safe_malloc(norm_bytes);
        cudaMemcpy(backup_norm_h, norm, norm_bytes, cudaMemcpyDeviceToHost);
      }
    }

    /** Restore the field from the host after tuning */
    void load() {
      cudaMemcpy(clover, backup_h, bytes, cudaMemcpyHostToDevice);
      if (norm_bytes) cudaMemcpy(norm, backup_norm_h, norm_bytes, cudaMemcpyHostToDevice);
      // the host backup buffers are released here
      backup_norm_h = nullptr;
    }
    size_t Bytes() const {
      size_t bytes = length*sizeof(Float);
      // ... (per-block norm bytes for fixed-point storage elided in this excerpt) ...
      return bytes;
    }
  };
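  // ------------------------------------------------------------------------
  // Illustrative aside (not part of the original header): for fixed-point
  // (half/quarter) storage the save path above keeps the block maximum in a
  // separate norm array and stores the elements scaled by fixed_max/scale;
  // the load path multiplies back by the stored scale.  A self-contained host
  // round trip of that convention (all names and values are hypothetical):
  #include <cassert>
  #include <cmath>
  #include <cstdint>
  #include <vector>

  inline void fixed_point_round_trip() {
    const float fixed_max = 32767.0f;                      // largest short magnitude
    std::vector<float> v = {0.75f, -0.25f, 0.5f, -1.5f};   // one hypothetical block

    // "save": record the block norm, then quantize
    float scale = 0.0f;
    for (float x : v) scale = std::fabs(x) > scale ? std::fabs(x) : scale;
    std::vector<int16_t> q(v.size());
    for (size_t i = 0; i < v.size(); i++)
      q[i] = static_cast<int16_t>(std::lrintf(v[i] * fixed_max / scale));

    // "load": rescale and confirm the round trip is accurate to ~scale/fixed_max
    for (size_t i = 0; i < v.size(); i++) {
      float r = q[i] * scale / fixed_max;
      assert(std::fabs(r - v[i]) <= scale / fixed_max);
    }
  }
  // ------------------------------------------------------------------------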
  /** This is just a dummy structure we use for trove to define the required structure size */
  template <typename real, int length> struct S { real v[length]; };
  /** QDP ordering for clover fields */
  template <typename Float, int length>
    struct QDPOrder {
    typedef typename mapper<Float>::type RegType;
    Float *clover;
    const int volumeCB;
    const int stride;
    const int offset;
    const bool twisted;
    const Float mu2;

    QDPOrder(const CloverField &clover, bool inverse, Float *clover_=0)
      : volumeCB(clover.VolumeCB()), stride(volumeCB), offset(clover.Bytes()/(2*sizeof(Float))),
        twisted(clover.Twisted()), mu2(clover.Mu2()) {
      this->clover = clover_ ? clover_ : (Float*)(clover.V(inverse));
    }

    Float Mu2() const { return mu2; }
    /** Load accessor for the clover matrix; the factor of 0.5 comes from the basis change */
    __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> clover_((structure*)clover);
      structure v_ = clover_[parity*volumeCB + x];
      for (int i=0; i<length; i++) v[i] = 0.5*(RegType)v_.v[i];
#else
      for (int i=0; i<length; i++) v[i] = 0.5*clover[parity*offset + x*length+i];
#endif
    }
    /** Store accessor for the clover matrix; the factor of 2.0 undoes the basis change applied on load */
    __device__ __host__ inline void save(const RegType v[length], int x, int parity) {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> clover_((structure*)clover);
      structure v_;
      for (int i=0; i<length; i++) v_.v[i] = 2.0*(Float)v[i];
      clover_[parity*volumeCB + x] = v_;
#else
      for (int i=0; i<length; i++) clover[parity*offset + x*length+i] = 2.0*v[i];
#endif
    }
  };
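  // ------------------------------------------------------------------------
  // Illustrative aside (not part of the original header): in the QDP order,
  // real i of site x on a given parity lives at parity*offset + x*length + i,
  // and the 0.5 on load / 2.0 on save implement the basis-change convention,
  // so a load followed by a save leaves the stored data unchanged.  A host
  // sketch with hypothetical sizes:
  #include <cassert>
  #include <vector>

  inline void check_qdp_order_round_trip() {
    const int length = 72, volumeCB = 4;
    const int offset = volumeCB * length;            // reals per parity
    std::vector<double> field(2 * offset, 1.0);      // hypothetical QDP-ordered data

    const int parity = 1, x = 2;
    double v[72];
    for (int i = 0; i < length; i++) v[i] = 0.5 * field[parity*offset + x*length + i];  // load
    for (int i = 0; i < length; i++) field[parity*offset + x*length + i] = 2.0 * v[i];  // save
    for (int i = 0; i < length; i++) assert(field[parity*offset + x*length + i] == 1.0);
  }
  // ------------------------------------------------------------------------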
  /** QDPJIT ordering for clover fields: separate diagonal and off-diagonal arrays */
  template <typename Float, int length>
    struct QDPJITOrder {
    typedef typename mapper<Float>::type RegType;
    Float *diag;      /**< pointer to the diagonal terms */
    Float *offdiag;   /**< pointer to the off-diagonal terms */
    const int volumeCB;
    const int stride;
    const bool twisted;
    const Float mu2;

    QDPJITOrder(const CloverField &clover, bool inverse, Float *clover_=0)
      : volumeCB(clover.VolumeCB()), stride(volumeCB), twisted(clover.Twisted()), mu2(clover.Mu2()) {
      offdiag = clover_ ? ((Float**)clover_)[0] : ((Float**)clover.V(inverse))[0];
      diag    = clover_ ? ((Float**)clover_)[1] : ((Float**)clover.V(inverse))[1];
    }

    Float Mu2() const { return mu2; }
    /** Load accessor: for each chirality the 6 diagonal elements are set first,
        then the 30 off-diagonal reals, remapped through idtab */
    __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
      for (int i=0; i<6; i++) { /* ... diagonal elements read from diag[] ... */ }
      for (int i=0; i<30; i++) {
        const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
        // ... off-diagonal elements read from offdiag[] via idtab ...
      }
    }

    /** Store accessor: the inverse remapping of load */
    __device__ __host__ inline void save(const RegType v[length], int x, int parity) {
      for (int i=0; i<6; i++) { /* ... diagonal elements written to diag[] ... */ }
      for (int i=0; i<30; i++) {
        const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
        // ... off-diagonal elements written to offdiag[] via idtab ...
      }
    }
  };
  /** BQCD ordering for clover fields: reorders (and changes the basis of) a
      BQCD clover matrix into the order expected by QUDA */
  template <typename Float, int length>
    struct BQCDOrder {
    typedef typename mapper<Float>::type RegType;
    Float *clover[2];
    const int volumeCB;
    const int stride;
    const bool twisted;
    const Float mu2;

    BQCDOrder(const CloverField &clover, bool inverse, Float *clover_=0)
      : volumeCB(clover.Stride()), stride(volumeCB), twisted(clover.Twisted()), mu2(clover.Mu2()) {
      this->clover[0] = clover_ ? clover_ : (Float*)(clover.V(inverse));
      this->clover[1] = (Float*)((char*)this->clover[0] + clover.Bytes()/2);
    }

    Float Mu2() const { return mu2; }
    /** Load accessor: permute the BQCD matrix elements into QUDA order via bq[]
        and flip the signs required by the change of basis */
    __device__ __host__ inline void load(RegType v[length], int x, int parity) const {
      int bq[36] = { 21, 32, 33, 0,  1, 20,                  // diagonal
                     28, 29, 30, 31, 6, 7, 14, 15, 22, 23,   // column 1
                     34, 35, 8, 9, 16, 17, 24, 25,           // column 2
                     10, 11, 18, 19, 26, 27,                 // column 3
                     2,  3,  4,  5,                          // column 4
                     12, 13 };                               // column 5

      // flip the sign of the imaginary components
      int sign[36];
      for (int i=0; i<6; i++) sign[i] = 1;
      for (int i=6; i<36; i+=2) {
        if ( (i >= 10 && i <= 15) || (i >= 18 && i <= 29) ) { sign[i] = -1; sign[i+1] = -1; }
        else { sign[i] = 1; sign[i+1] = -1; }
      }

      const int M = length/2;  // reals per chiral block
      for (int i=0; i<M; i++) {
        // ... v[] gathered per chirality from clover[parity] through bq[], with sign[] applied ...
      }
    }
  };
  /** clover_mapper traits: given the storage precision, select the corresponding
      native FloatNOrder accessor type */
  template<typename Float, int N=72, bool add_rho=false> struct clover_mapper { };

  template<int N, bool add_rho> struct clover_mapper<double,N,add_rho> { typedef clover::FloatNOrder<double, N, 2, add_rho> type; };
  template<int N, bool add_rho> struct clover_mapper<float, N,add_rho> { typedef clover::FloatNOrder<float,  N, 4, add_rho> type; };
  template<int N, bool add_rho> struct clover_mapper<short, N,add_rho> { typedef clover::FloatNOrder<short,  N, 4, add_rho> type; };
  template<int N, bool add_rho> struct clover_mapper<char,  N,add_rho> { typedef clover::FloatNOrder<char,   N, 4, add_rho> type; };
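  // ------------------------------------------------------------------------
  // Illustrative aside (not part of the original header): clover_mapper lets
  // kernels pick the native accessor from the storage precision alone, e.g.
  // "typename clover_mapper<Float>::type order(clover, is_inverse);".  The
  // self-contained analogue below mirrors the precision -> vector-length rule
  // (double -> 2, float/short/char -> 4); all names here are hypothetical.
  #include <type_traits>

  template <typename Float> struct native_vector_length { };
  template <> struct native_vector_length<double> : std::integral_constant<int, 2> { };
  template <> struct native_vector_length<float>  : std::integral_constant<int, 4> { };
  template <> struct native_vector_length<short>  : std::integral_constant<int, 4> { };
  template <> struct native_vector_length<char>   : std::integral_constant<int, 4> { };

  static_assert(native_vector_length<double>::value == 2, "double fields use a float2-like order");
  static_assert(native_vector_length<float>::value  == 4, "float fields use a float4-like order");
  // ------------------------------------------------------------------------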
#endif //_CLOVER_ORDER_H