1 #ifndef _CLOVER_ORDER_H
2 #define _CLOVER_ORDER_H
16 #include <trove_helper.cuh>
33 template <
typename Float,
typename T>
55 __device__ __host__
inline void operator=(
const C &a) {
60 template <
typename T,
int N>
66 template <
typename T,
int N>
74 template<
typename ReduceType,
typename Float>
struct square_ {
76 {
return static_cast<ReduceType
>(
norm(x)); }
79 template<
typename ReduceType,
typename Float>
struct abs_ {
81 {
return static_cast<ReduceType
>(
abs(x)); }
168 template<
typename Float,
int nColor,
int nSpin, QudaCloverFieldOrder order>
struct Accessor {
171 errorQuda(
"Not implemented for order %d", order);
175 int c_row,
int c_col)
const {
179 template <
typename helper,
typename reducer>
186 template<
typename Float,
int nColor,
int nSpin>
191 static constexpr
int N = nSpin *
nColor / 2;
193 : a(static_cast<
Float*>(const_cast<void*>(A.
V(
inverse)))), stride(A.Stride()),
194 offset_cb(A.Bytes()/(2*sizeof(
Float))) { }
200 const int chirality = s_col / 2;
202 int row = s_row%2 *
nColor + c_row;
203 int col = s_col%2 *
nColor + c_col;
204 Float *a_ = a+
parity*offset_cb+stride*chirality*N*N;
207 return 2*a_[ row*stride+x ];
208 }
else if (col < row) {
210 int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
213 return 2*off[k*stride + x];
217 int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
219 return 2*
conj(off[k*stride + x]);
224 template <
typename helper,
typename reducer>
234 __device__ __host__
inline int indexFloatN(
int k,
int stride,
int x) {
237 return (j*stride+x)*N + i;
240 template<
typename Float,
int nColor,
int nSpin>
245 static constexpr
int N = nSpin *
nColor / 2;
247 : a(static_cast<
Float*>(const_cast<void*>(A.
V(
inverse)))), stride(A.Stride()),
248 offset_cb(A.Bytes()/(2*sizeof(
Float))) { }
254 const int chirality = s_col / 2;
256 int row = s_row%2 *
nColor + c_row;
257 int col = s_col%2 *
nColor + c_col;
258 Float *a_ = a+
parity*offset_cb+stride*chirality*N*N;
261 return 2*a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(row, stride, x) ];
262 }
else if (col < row) {
264 int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
267 return 2*
complex<Float>(a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0,stride,x) ],
268 a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1,stride,x) ]);
272 int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
275 return 2*
complex<Float>( a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+0,stride,x) ],
276 -a_[ indexFloatN<QUDA_FLOAT4_CLOVER_ORDER>(idx+1,stride,x) ]);
281 template <
typename helper,
typename reducer>
290 template<
typename Float,
int nColor,
int nSpin>
305 if (s_col / 2 != s_row / 2) {
return zero; }
307 const int chirality = s_col / 2;
309 unsigned int row = s_row%2 *
nColor + c_row;
310 unsigned int col = s_col%2 *
nColor + c_col;
315 }
else if (col < row) {
317 int k = N*(N-1)/2 - (N-col)*(N-col-1)/2 + row - col - 1;
318 int idx = (x*2 + chirality)*N*N + N + 2*k;
322 int k = N*(N-1)/2 - (N-row)*(N-row-1)/2 + col - row - 1;
323 int idx = (x*2 + chirality)*N*N + N + 2*k;
328 template <
typename helper,
typename reducer>
372 template <
typename Float,
int nColor,
int nSpin, QudaCloverFieldOrder order>
408 int s_col,
int c_row,
int c_col)
const {
427 int s_col,
int c_row,
int c_col)
const {
460 constexpr
int n = (nSpin *
nColor) / 2;
461 constexpr
int chiral_block = n * n / 2;
462 return static_cast<size_t>(
volumeCB) * chiral_block * 2ll * 2ll *
sizeof(
Float);
470 __host__
double norm1(
int dim=-1,
bool global=
true)
const {
481 __host__
double norm2(
int dim=-1,
bool global=
true)
const {
492 __host__
double abs_max(
int dim=-1,
bool global=
true)
const {
503 __host__
double abs_min(
int dim=-1,
bool global=
true)
const {
523 template <
typename Float,
int length,
int N,
bool add_rho=false,
bool huge_alloc=false>
549 bool override =
false) :
562 if (
clover.Order() != N) {
563 errorQuda(
"Invalid clover order %d for FloatN (N=%d) accessor",
clover.Order(), N);
565 this->clover = clover_ ? clover_ : (
Float *)(
clover.V(is_inverse));
598 int x_cb,
int parity,
int chirality)
const
616 for (
int i=0; i<
M; i++) {
621 for (
int j = 0; j < N; j++) {
copy_and_scale(v[i * N + j],
reinterpret_cast<Float *
>(&vecTmp)[j], nrm); }
624 if (add_rho)
for (
int i=0; i<6; i++) v[i] +=
rho;
651 for (
int i = 0; i <
block; i++)
tmp[i] = v[i] * scale_inv;
654 for (
int i = 0; i <
block; i++)
tmp[i] = v[i];
658 for (
int i = 0; i <
M; i++) {
661 for (
int j = 0; j < N; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j],
tmp[i * N + j]);
676 for (
int chirality = 0; chirality < 2; chirality++)
load(&v[chirality *
block], x,
parity, chirality);
688 for (
int chirality = 0; chirality < 2; chirality++)
save(&v[chirality *
block], x,
parity, chirality);
733 template <
typename real,
int length>
struct S { real
v[
length]; };
738 template <
typename Float,
int length>
753 errorQuda(
"Invalid clover order %d for this accessor",
clover.Order());
763 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
774 #if defined( __CUDA_ARCH__) && !defined(DISABLE_TROVE)
778 for (
int i=0; i<
length; i++) v_.v[i] = 2.0*(
Float)v[i];
791 template <
typename Float,
int length>
805 errorQuda(
"Invalid clover order %d for this accessor", clover.
Order());
816 for (
int chirality=0; chirality<2; chirality++) {
818 for (
int i=0; i<6; i++) {
823 for (
int i=0; i<30; i++) {
826 const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
827 v[chirality*36 + 6 + i] = 0.5*
offdiag[(((z*15 + idtab[off])*2 + chirality)*2 +
parity)*
volumeCB + x];
835 for (
int chirality=0; chirality<2; chirality++) {
837 for (
int i=0; i<6; i++) {
842 for (
int i=0; i<30; i++) {
845 const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
846 offdiag[(((z*15 + idtab[off])*2 + chirality)*2 +
parity)*
volumeCB + x] = 2.0*v[chirality*36 + 6 + i];
861 template <
typename Float,
int length>
874 errorQuda(
"Invalid clover order %d for this accessor",
clover.Order());
877 this->clover[1] = (
Float *)((
char *)this->clover[0] +
clover.Bytes() / 2);
890 int bq[36] = { 21, 32, 33, 0, 1, 20,
891 28, 29, 30, 31, 6, 7, 14, 15, 22, 23,
892 34, 35, 8, 9, 16, 17, 24, 25,
893 10, 11, 18, 19, 26, 27,
899 for (
int i=0; i<6; i++) sign[i] = 1;
900 for (
int i=6; i<36; i+=2) {
901 if ( (i >= 10 && i<= 15) || (i >= 18 && i <= 29) ) { sign[i] = -1; sign[i+1] = -1; }
902 else { sign[i] = 1; sign[i+1] = -1; }
906 for (
int chirality=0; chirality<2; chirality++)
907 for (
int i=0; i<M; i++)
923 template<
typename Float,
int N=72,
bool add_rho=false>
struct clover_mapper { };
QudaCloverFieldOrder Order() const
void * V(bool inverse=false)
__device__ __host__ void operator=(const HMatrix< U, N > &b)
__device__ __host__ HMatrix()
void comm_allreduce_min(double *data)
void comm_allreduce_max(double *data)
void comm_allreduce(double *data)
cudaColorSpinorField * tmp
@ QUDA_FLOAT4_CLOVER_ORDER
@ QUDA_FLOAT2_CLOVER_ORDER
@ QUDA_QDPJIT_CLOVER_ORDER
@ QUDA_PACKED_CLOVER_ORDER
enum QudaFieldLocation_s QudaFieldLocation
#define safe_malloc(size)
void init()
Create the BLAS context.
__device__ __host__ int indexFloatN(int k, int stride, int x)
__host__ __device__ ValueType conj(ValueType x)
void transform_reduce(Arg &arg)
__device__ __host__ void zero(double &a)
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
__host__ __device__ std::enable_if<!isFixed< T2 >::value, void >::type copy_and_scale(T1 &a, const T2 &b, const T3 &c)
Specialized variants of the copy function that include an additional scale factor....
__host__ __device__ ValueType abs(ValueType x)
FloatingPoint< float > Float
#define qudaMemcpy(dst, src, count, kind)
Provides precision abstractions and defines the register precision given the storage precision using ...
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
Accessor(const CloverField &A, bool inverse=false)
Accessor(const CloverField &A, bool inverse=false)
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
__host__ double transform_reduce(QudaFieldLocation location, helper h, double init, reducer r) const
__device__ __host__ complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Accessor(const CloverField &A, bool inverse=false)
__device__ __host__ complex< Float > & operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Accessor(const CloverField &A, bool inverse=false)
__host__ double transform_reduce(QudaFieldLocation location, helper h, double i, reducer r) const
__device__ __host__ void load(RegType v[length], int x, int parity) const
mapper< Float >::type RegType
__device__ __host__ void save(RegType v[length], int x, int parity)
BQCDOrder(const CloverField &clover, bool inverse, Float *clover_=0)
__device__ __host__ complex< Float > operator()(int dummy, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Read-only complex-member accessor function. This is a special variant that is compatible with the equ...
__device__ __host__ int Ncolor() const
Complex-member accessor function.
FieldOrder(CloverField &A, bool inverse=false)
const QudaFieldLocation location
__device__ __host__ int VolumeCB() const
__host__ double norm1(int dim=-1, bool global=true) const
Returns the L1 norm of the field.
__host__ double norm2(int dim=-1, bool global=true) const
Returns the L2 norm suared of the field.
__device__ __host__ int Volume() const
__device__ __host__ const complex< Float > operator()(int parity, int x, int s_row, int s_col, int c_row, int c_col) const
Read-only complex-member accessor function.
__host__ double abs_max(int dim=-1, bool global=true) const
Returns the Linfinity norm of the field.
__host__ double abs_min(int dim=-1, bool global=true) const
Returns the minimum absolute value of the field.
static constexpr bool supports_ghost_zone
const Accessor< Float, nColor, nSpin, order > accessor
Accessor routine for CloverFields in native field order.
__device__ __host__ void load(real v[block], int x, int parity, int chirality) const
Load accessor for a single chiral block.
__device__ __host__ clover_wrapper< real, Accessor > operator()(int x_cb, int parity, int chirality)
This accessor routine returns a clover_wrapper to this object, allowing us to overload various operat...
const AllocInt norm_offset
__device__ __host__ void save(const real v[length], int x, int parity)
Store accessor for the clover matrix.
void save()
Backup the field to the host when tuning.
__device__ __host__ void save(const real v[block], int x, int parity, int chirality)
Store accessor for a single chiral block.
__device__ __host__ const clover_wrapper< real, Accessor > operator()(int x_cb, int parity, int chirality) const
This accessor routine returns a const colorspinor_wrapper to this object, allowing us to overload var...
typename mapper< Float >::type real
AllocType< huge_alloc >::type AllocInt
FloatNOrder(const CloverField &clover, bool is_inverse, Float *clover_=0, norm_type *norm_=0, bool override=false)
host memory for backing up norm when tuning
void load()
Restore the field from the host after tuning.
VectorType< Float, N >::type Vector
__device__ __host__ void load(real v[length], int x, int parity) const
Load accessor for the clover matrix.
void * backup_norm_h
host memory for backing up the field when tuning
QDPJITOrder(const CloverField &clover, bool inverse, Float *clover_=0)
__device__ __host__ void load(RegType v[length], int x, int parity) const
__device__ __host__ void save(const RegType v[length], int x, int parity)
mapper< Float >::type RegType
QDPOrder(const CloverField &clover, bool inverse, Float *clover_=0)
mapper< Float >::type RegType
__device__ __host__ void load(RegType v[length], int x, int parity) const
__device__ __host__ void save(const RegType v[length], int x, int parity)
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
clover::FloatNOrder< double, N, 2, add_rho > type
clover::FloatNOrder< float, N, 4, add_rho > type
clover::FloatNOrder< int8_t, N, 4, add_rho > type
clover::FloatNOrder< short, N, 4, add_rho > type
clover_wrapper is an internal class that is used to wrap instances of colorspinor accessors,...
__device__ __host__ void operator=(const C &a)
Assignment operator with H matrix instance as input.