#ifndef _COLOR_SPINOR_ORDER_H
#define _COLOR_SPINOR_ORDER_H

// ...

#include <index_helper.cuh>
#include <trove_helper.cuh>
// colorspinor_wrapper curries a specific site (x_cb, parity) into a
// colorspinor-field accessor, so that accessors can appear directly in
// ColorSpinor expressions inside kernels without explicit load/save calls.
template <typename Float, typename T> struct colorspinor_wrapper {
  T &field;
  const int x_cb;
  const int parity;
  // ...
};
// Assignment and construction of a ColorSpinor from a colorspinor_wrapper:
// every specialization simply loads from the wrapped field at the curried site.
template <typename T, int Nc, int Ns>
template <typename S>
__device__ __host__ inline void ColorSpinor<T, Nc, Ns>::operator=(const colorspinor_wrapper<T, S> &a)
{
  a.field.load(data, a.x_cb, a.parity);
}

template <typename T, int Nc, int Ns>
template <typename S>
__device__ __host__ inline ColorSpinor<T, Nc, Ns>::ColorSpinor(const colorspinor_wrapper<T, S> &a)
{
  a.field.load(data, a.x_cb, a.parity);
}

// ... the four remaining overloads are the analogous operator= and
// constructor for the fixed-spin ColorSpinor<T, Nc> specializations, each
// with the same body:
//   a.field.load(data, a.x_cb, a.parity);
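The value of this indirection is easiest to see in isolation. Below is a minimal standalone sketch of the same currying pattern with mock Field and Vec types (none of these names are QUDA's): operator() captures the site, and assignment pulls the data through load().

#include <cstdio>

template <typename T> struct site_wrapper {
  T &field;
  const int x_cb, parity;
};

struct Field {
  float data[2][4]; // [parity][site]
  void load(float &out, int x_cb, int parity) const { out = data[parity][x_cb]; }
  site_wrapper<Field> operator()(int x_cb, int parity) { return {*this, x_cb, parity}; }
};

struct Vec {
  float v;
  // assignment from a wrapper triggers the load at the curried site
  Vec &operator=(const site_wrapper<Field> &a) { a.field.load(v, a.x_cb, a.parity); return *this; }
};

int main()
{
  Field f{{{0, 1, 2, 3}, {4, 5, 6, 7}}};
  Vec x;
  x = f(2, 1);              // reads f.data[1][2] with no explicit load() call
  std::printf("%g\n", x.v); // prints 6
}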
// colorspinor_ghost_wrapper plays the same role for the ghost-zone
// accessors, currying in (dim, dir, ghost_idx, parity).
template <typename Float, typename T> struct colorspinor_ghost_wrapper {
  T &field;
  const int dim;
  const int dir;
  const int ghost_idx;
  const int parity;
  // ...
};
// Assignment and construction of a ColorSpinor from a
// colorspinor_ghost_wrapper: every specialization loads from the ghost zone
// of the wrapped field at the curried face site.
template <typename T, int Nc, int Ns>
template <typename S>
__device__ __host__ inline void ColorSpinor<T, Nc, Ns>::operator=(const colorspinor_ghost_wrapper<T, S> &a)
{
  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
}

template <typename T, int Nc, int Ns>
template <typename S>
__device__ __host__ inline ColorSpinor<T, Nc, Ns>::ColorSpinor(const colorspinor_ghost_wrapper<T, S> &a)
{
  a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
}

// ... analogous operator= and constructor overloads for the fixed-spin
// ColorSpinor<T, Nc> specializations, each calling
//   a.field.loadGhost(data, a.ghost_idx, a.dim, a.dir, a.parity);
namespace colorspinor {

  // Generic functor mapping a complex element to its squared magnitude in
  // the reduction precision (used by the norm2 reductions)
  template <typename ReduceType, typename Float> struct square_ {
    square_(ReduceType) { }
    __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
    {
      return static_cast<ReduceType>(norm(x));
    }
  };
  // Fixed-point storage specializations: rescale the integer payload into
  // the reduction precision before forming |z|^2
  template <typename ReduceType> struct square_<ReduceType, short> {
    const ReduceType scale;
    square_(ReduceType scale) : scale(scale) { }
    __host__ __device__ inline ReduceType operator()(const quda::complex<short> &x)
    {
      return norm(scale * complex<ReduceType>(x.real(), x.imag()));
    }
  };

  template <typename ReduceType> struct square_<ReduceType, int8_t> {
    const ReduceType scale;
    square_(ReduceType scale) : scale(scale) { }
    __host__ __device__ inline ReduceType operator()(const quda::complex<int8_t> &x)
    {
      return norm(scale * complex<ReduceType>(x.real(), x.imag()));
    }
  };
  // Functor returning the magnitude of a complex element, used by abs_max
  template <typename Float, typename storeFloat> struct abs_ {
    abs_(const Float) { }
    __host__ __device__ Float operator()(const quda::complex<storeFloat> &x) { return abs(x); }
  };

  // ... fixed-point specializations of abs_ for short and int8_t rescale by
  // the fixed-point scale, mirroring square_ above
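As a concrete illustration of the fixed-point specializations, here is a standalone analogue of square_<ReduceType, short> using std::complex in place of quda::complex (the scale value and names are illustrative, not QUDA's):

#include <complex>
#include <cstdio>

template <typename ReduceType> struct square_short {
  const ReduceType scale;
  square_short(ReduceType scale) : scale(scale) { }
  ReduceType operator()(const std::complex<short> &x)
  {
    // rescale the stored integers into the reduction precision, then |z|^2
    return std::norm(scale * std::complex<ReduceType>(x.real(), x.imag()));
  }
};

int main()
{
  square_short<double> sq(1.0 / 32767.0); // scale_inv for max = 1.0
  std::complex<short> z(32767, 0);        // stored representation of 1.0
  std::printf("|z|^2 = %g\n", sq(z));     // prints 1
}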
  // Dummy primary template: each supported QudaFieldOrder provides a
  // partial specialization below
  template <typename Float, int nSpin, int nColor, int nVec, QudaFieldOrder order> struct AccessorCB {
    AccessorCB(const ColorSpinorField &) { errorQuda("Not implemented"); }
    __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const { return 0; }
  };
  template <typename Float, int nSpin, int nColor, int nVec, QudaFieldOrder order> struct GhostAccessorCB {
    GhostAccessorCB(const ColorSpinorField &) { errorQuda("Not implemented"); }
    __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
    {
      return 0;
    }
  };
  template <typename Float, int nSpin, int nColor, int nVec>
  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_SPACE_SPIN_COLOR_FIELD_ORDER> {
    const int offset_cb;
    AccessorCB(const ColorSpinorField &field) : offset_cb((field.Bytes() >> 1) / sizeof(complex<Float>)) { }

    __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
    {
      return parity * offset_cb + ((x_cb * nSpin + s) * nColor + c) * nVec + v;
    }

    // index of the first element of the (parity, x_cb, s) block: the pointer
    // returned by the wrap() methods points to the start of the color vector
    __device__ __host__ inline int wrap_index(int parity, int x_cb, int s) const
    {
      return parity * offset_cb + (x_cb * nSpin + s) * nColor * nVec;
    }
  };
  template <typename Float, int nSpin, int nColor, int nVec>
  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_SPACE_SPIN_COLOR_FIELD_ORDER> {
    int faceVolumeCB[4];
    int ghostOffset[4];
    GhostAccessorCB(const ColorSpinorField &a, int nFace = 1)
    {
      for (int d = 0; d < 4; d++) {
        faceVolumeCB[d] = nFace * a.SurfaceCB(d);
        ghostOffset[d] = faceVolumeCB[d] * nColor * nSpin * nVec;
      }
    }
    __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
    {
      return parity * ghostOffset[dim] + ((x_cb * nSpin + s) * nColor + c) * nVec + v;
    }
  };
  // Linear index into a FloatN-ordered field: the 2 * nSpin * nColor * nVec
  // reals of a site are packed into N-component vectors, with the chunk
  // index j strided by the field stride
  template <int nSpin, int nColor, int nVec, int N>
  __device__ __host__ inline int indexFloatN(int x_cb, int s, int c, int v, int stride)
  {
    int k = (s * nColor + c) * nVec + v; // linear internal (complex) index
    int j = k / (N / 2);                 // which N-component chunk
    int i = k % (N / 2);                 // complex element within the chunk
    return (j * stride + x_cb) * (N / 2) + i;
  }
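To make the interleaving concrete, the standalone sketch below evaluates the same arithmetic for a Wilson spinor (nSpin=4, nColor=3, nVec=1) in float4 order and prints where each (s, c) complex element lands; stride and x_cb are made-up values.

#include <cstdio>

template <int nSpin, int nColor, int nVec, int N>
int indexFloatN(int x_cb, int s, int c, int v, int stride)
{
  int k = (s * nColor + c) * nVec + v;
  int j = k / (N / 2);
  int i = k % (N / 2);
  return (j * stride + x_cb) * (N / 2) + i;
}

int main()
{
  const int stride = 16, x_cb = 3;
  for (int s = 0; s < 4; s++)
    for (int c = 0; c < 3; c++)
      std::printf("s=%d c=%d -> %d\n", s, c, indexFloatN<4, 3, 1, 4>(x_cb, s, c, 0, stride));
}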
  template <typename Float, int nSpin, int nColor, int nVec>
  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT2_FIELD_ORDER> {
    const int stride;
    const int offset_cb;
    AccessorCB(const ColorSpinorField &field) :
      stride(field.Stride()),
      offset_cb((field.Bytes() >> 1) / sizeof(complex<Float>))
    { }

    __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
    {
      return parity * offset_cb + ((s * nColor + c) * nVec + v) * stride + x_cb;
    }

    template <int nSpinBlock>
    __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], complex<Float> *in,
                                         int parity, int x_cb, int chi) const
    {
      using vec_t = typename VectorType<Float, 2>::type;
      constexpr int M = nSpinBlock * nColor * nVec;
#pragma unroll
      for (int i = 0; i < M; i++) {
        // each float2 load fetches one complex number
        vec_t tmp = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb),
                                       (chi * M + i) * stride + x_cb);
        memcpy(&out[i], &tmp, sizeof(vec_t));
      }
    }
  };
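Note that the float2 index is exactly indexFloatN with N = 2, since each complex number is then its own chunk. A standalone check, with arbitrary stride and site:

#include <cstdio>

int indexFloat2(int x_cb, int s, int c, int v, int stride, int nColor, int nVec)
{
  return ((s * nColor + c) * nVec + v) * stride + x_cb;
}

int indexFloatN(int x_cb, int s, int c, int v, int stride, int nColor, int nVec, int N)
{
  int k = (s * nColor + c) * nVec + v;
  int j = k / (N / 2), i = k % (N / 2);
  return (j * stride + x_cb) * (N / 2) + i;
}

int main()
{
  const int stride = 32, nColor = 3, nVec = 1;
  int agree = 1;
  for (int s = 0; s < 4; s++)
    for (int c = 0; c < nColor; c++)
      agree &= indexFloat2(7, s, c, 0, stride, nColor, nVec)
            == indexFloatN(7, s, c, 0, stride, nColor, nVec, 2);
  std::printf("float2 == FloatN(N=2): %s\n", agree ? "yes" : "no");
}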
  template <typename Float, int nSpin, int nColor, int nVec>
  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT2_FIELD_ORDER> {
    int faceVolumeCB[4];
    int ghostOffset[4];
    GhostAccessorCB(const ColorSpinorField &a, int nFace = 1)
    {
      for (int d = 0; d < 4; d++) {
        faceVolumeCB[d] = nFace * a.SurfaceCB(d);
        ghostOffset[d] = faceVolumeCB[d] * nColor * nSpin * nVec;
      }
    }
    // same layout as the bulk float2 order, with the face volume as the stride
    __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
    {
      return parity * ghostOffset[dim] + ((s * nColor + c) * nVec + v) * faceVolumeCB[dim] + x_cb;
    }
  };
  template <typename Float, int nSpin, int nColor, int nVec>
  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT4_FIELD_ORDER> {
    const int stride;
    const int offset_cb;
    AccessorCB(const ColorSpinorField &field) :
      stride(field.Stride()),
      offset_cb((field.Bytes() >> 1) / sizeof(complex<Float>))
    { }

    __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
    {
      return parity * offset_cb + indexFloatN<nSpin, nColor, nVec, 4>(x_cb, s, c, v, stride);
    }

    template <int nSpinBlock>
    __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], complex<Float> *in,
                                         int parity, int x_cb, int chi) const
    {
      using vec_t = typename VectorType<Float, 4>::type;
      // each float4 load fetches two complex numbers
      constexpr int M = (nSpinBlock * nColor * nVec * 2) / 4;
#pragma unroll
      for (int i = 0; i < M; i++) {
        vec_t tmp = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb),
                                       (chi * M + i) * stride + x_cb);
        memcpy(&out[i * 2], &tmp, sizeof(vec_t));
      }
    }
  };
  template <typename Float, int nSpin, int nColor, int nVec>
  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT4_FIELD_ORDER> {
    int faceVolumeCB[4];
    int ghostOffset[4];
    GhostAccessorCB(const ColorSpinorField &a, int nFace = 1)
    {
      for (int d = 0; d < 4; d++) {
        faceVolumeCB[d] = nFace * a.SurfaceCB(d);
        ghostOffset[d] = faceVolumeCB[d] * nColor * nSpin * nVec;
      }
    }
    __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
    {
      return parity * ghostOffset[dim] + indexFloatN<nSpin, nColor, nVec, 4>(x_cb, s, c, v, faceVolumeCB[dim]);
    }
  };
  template <typename Float, int nSpin, int nColor, int nVec>
  struct AccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT8_FIELD_ORDER> {
    const int stride;
    const int offset_cb;
    AccessorCB(const ColorSpinorField &field) :
      stride(field.Stride()),
      offset_cb((field.Bytes() >> 1) / sizeof(complex<Float>))
    { }

    __device__ __host__ inline int index(int parity, int x_cb, int s, int c, int v) const
    {
      return parity * offset_cb + indexFloatN<nSpin, nColor, nVec, 8>(x_cb, s, c, v, stride);
    }

    template <int nSpinBlock>
    __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], complex<Float> *in,
                                         int parity, int x_cb, int chi) const
    {
      using vec_t = typename VectorType<Float, 8>::type;
      constexpr int N = nSpin * nColor * nVec * 2; // reals per site
      constexpr int M = N / 8;                     // float8 loads per site
      Float tmp[N];
#pragma unroll
      for (int i = 0; i < M; i++) {
        vec_t ld_tmp = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb), i * stride + x_cb);
        memcpy(&tmp[i * 8], &ld_tmp, sizeof(vec_t));
      }
      // with float8 a single chunk can span both chiralities, so the full
      // site is loaded and the requested chiral block copied out
      constexpr int N_chi = N / (nSpin / nSpinBlock);
#pragma unroll
      for (int i = 0; i < N_chi; i++)
        out[i] = complex<Float>(tmp[chi * N_chi + 2 * i + 0], tmp[chi * N_chi + 2 * i + 1]);
    }
  };
  template <typename Float, int nSpin, int nColor, int nVec>
  struct GhostAccessorCB<Float, nSpin, nColor, nVec, QUDA_FLOAT8_FIELD_ORDER> {
    int faceVolumeCB[4];
    int ghostOffset[4];
    GhostAccessorCB(const ColorSpinorField &a, int nFace = 1)
    {
      for (int d = 0; d < 4; d++) {
        faceVolumeCB[d] = nFace * a.SurfaceCB(d);
        ghostOffset[d] = faceVolumeCB[d] * nColor * nSpin * nVec;
      }
    }
    __device__ __host__ inline int index(int dim, int dir, int parity, int x_cb, int s, int c, int v) const
    {
      return parity * ghostOffset[dim] + indexFloatN<nSpin, nColor, nVec, 8>(x_cb, s, c, v, faceVolumeCB[dim]);
    }
  };
  // fixed_point<Float, storeFloat>() reports whether the storage type is a
  // fixed-point (integer) representation of Float
  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }
  template <> __host__ __device__ inline constexpr bool fixed_point<float, int8_t>() { return true; }
  template <> __host__ __device__ inline constexpr bool fixed_point<float, short>() { return true; }
  template <> __host__ __device__ inline constexpr bool fixed_point<float, int>() { return true; }

  // match<type_a, type_b>() reports whether the two types are identical
  // integer types, in which case assignments can bypass the scaling
  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
  template <> __host__ __device__ inline constexpr bool match<int, int>() { return true; }
  template <> __host__ __device__ inline constexpr bool match<int8_t, int8_t>() { return true; }
  template <> __host__ __device__ inline constexpr bool match<short, short>() { return true; }
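A standalone sketch of what these traits enable: when fixed_point<float, short>() holds, a value in [-max, max] is stored as a short via scale and recovered via scale_inv, mirroring resetScale() further below (all values here are illustrative).

#include <climits>
#include <cmath>
#include <cstdio>

int main()
{
  float max = 2.5f;                     // site maximum (from the norm field)
  float scale = SHRT_MAX / max;         // storeFloat = short
  float scale_inv = max / SHRT_MAX;
  float x = -1.7f;
  short stored = static_cast<short>(std::lround(scale * x)); // write path
  float recovered = scale_inv * stored;                      // read path
  std::printf("%f -> %d -> %f\n", x, stored, recovered);
}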
  // fieldorder_wrapper wraps a single complex element of a FieldOrder
  // accessor, carrying the fixed-point scale factors so that reads and
  // writes transparently convert between Float and storeFloat
  template <typename Float, typename storeFloat> struct fieldorder_wrapper {
    complex<storeFloat> *v;
    const int idx;
    const Float scale;
    const Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();

    __device__ __host__ inline fieldorder_wrapper(complex<storeFloat> *v, int idx, Float scale, Float scale_inv) :
      v(v), idx(idx), scale(scale), scale_inv(scale_inv)
    { }

    // return a pointer to the wrapped element
    __device__ __host__ inline auto data() { return &v[idx]; }
    __device__ __host__ inline const auto data() const { return &v[idx]; }

    // ... real()/imag() return the element in Float precision, applying
    // scale_inv when the storage is fixed point
    // Assignment and compound assignment: if the source and storage types
    // match, store directly; otherwise round-and-scale when fixed point
    template <typename theirFloat>
    __device__ __host__ inline void operator=(const complex<theirFloat> &a)
    {
      if (match<storeFloat, theirFloat>()) {
        v[idx] = complex<storeFloat>(a.real(), a.imag());
      } else {
        v[idx] = fixed ? complex<storeFloat>(round(scale * a.real()), round(scale * a.imag())) :
                         complex<storeFloat>(a.real(), a.imag());
      }
    }

    // ... operator=(const theirFloat &), operator+=, and operator-= follow
    // the same match / fixed-point dispatch
  };
  // FieldOrderCB is the generic accessor over all field orders, templated on
  // the register precision, the internal degrees of freedom, the field
  // order, and the storage/ghost storage types
  template <typename Float, int nSpin, int nColor, int nVec, QudaFieldOrder order,
            typename storeFloat = Float, typename ghostFloat = storeFloat,
            bool disable_ghost = false, bool block_float = false>
  class FieldOrderCB
  {
    typedef float norm_type;
    // ...
#ifndef DISABLE_GHOST
    // ghost pointers, ghost scale factors, and the ghost accessor are only
    // compiled in when the ghost zone is enabled
#endif
    static constexpr bool fixed = fixed_point<Float, storeFloat>();
    static constexpr bool ghost_fixed = fixed_point<Float, ghostFloat>();
    static constexpr bool block_float_ghost = !fixed && ghost_fixed;
    // The accessor aliases the field's own allocation unless explicit
    // backing pointers are supplied
    FieldOrderCB(const ColorSpinorField &field, int nFace = 1, void *v_ = 0, void **ghost_ = 0) :
      v(v_ ? static_cast<complex<storeFloat> *>(const_cast<void *>(v_)) :
             static_cast<complex<storeFloat> *>(const_cast<void *>(field.V()))),
      // ...
#ifndef DISABLE_GHOST
      // ... ghost accessor and ghost-zone state
#endif
    {
#ifdef DISABLE_GHOST
      if (!disable_ghost) errorQuda("DISABLE_GHOST macro set but corresponding disable_ghost template not set");
#endif
      // block-float fields also carry a per-site norm array
      norm = static_cast<norm_type *>(const_cast<void *>(field.Norm()));
      // ...
    }
#ifndef DISABLE_GHOST
    void resetGhost(const ColorSpinorField &a, void *const *ghost_) const
    {
      for (int dim = 0; dim < 4; dim++) {
        for (int dir = 0; dir < 2; dir++) {
          ghost[2 * dim + dir] = static_cast<complex<ghostFloat> *>(ghost_[2 * dim + dir]);
          // the block-float ghost norms are packed after the spinor payload
          ghost_norm[2 * dim + dir] = reinterpret_cast<norm_type *>(
            static_cast<char *>(ghost_[2 * dim + dir])
            + /* spinor payload elements */ ... * sizeof(ghostFloat));
        }
      }
    }
#endif
    void resetScale(Float max)
    {
      if (fixed) {
        scale = static_cast<Float>(std::numeric_limits<storeFloat>::max() / max);
        scale_inv = static_cast<Float>(max / std::numeric_limits<storeFloat>::max());
      }
#ifndef DISABLE_GHOST
      if (ghost_fixed) {
        if (block_float_ghost && max != static_cast<Float>(1.0))
          errorQuda("Block-float accessor requires max=1.0 not max=%e\n", max);
        ghost_scale = static_cast<Float>(std::numeric_limits<ghostFloat>::max() / max);
        ghost_scale_inv = static_cast<Float>(max / std::numeric_limits<ghostFloat>::max());
      }
#endif
    }
    // Load a chiral block of the site into registers; for fixed-point
    // storage each element is rescaled into Float precision on the way out
    template <int nSpinBlock>
    __device__ __host__ inline void load(complex<Float> out[nSpinBlock * nColor * nVec], int parity, int x_cb,
                                         int chi) const
    {
      complex<storeFloat> tmp[nSpinBlock * nColor * nVec];
      accessor.template load<nSpinBlock>(tmp, v, parity, x_cb, chi);
      for (int s = 0; s < nSpinBlock; s++) {
        for (int c = 0; c < nColor; c++) {
          for (int v = 0; v < nVec; v++) {
            int k = (s * nColor + c) * nVec + v;
            out[k] = fixed ? scale_inv * complex<Float>(tmp[k].real(), tmp[k].imag()) :
                             complex<Float>(tmp[k].real(), tmp[k].imag());
          }
        }
      }
    }
#if (__CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 520)
    // use __ldg on architectures where the read-only data cache must be
    // targeted explicitly
#endif
    // ... const and non-const operator()(parity, x_cb, s, c, n) accessors,
    // returning complex<Float> and a fieldorder_wrapper respectively

    // wrap() returns a fieldorder_wrapper whose pointer sits at the start of
    // the (parity, x_cb, s) color-vector block
    __device__ __host__ inline const auto wrap(int parity, int x_cb, int s) const
    {
      return fieldorder_wrapper<Float, storeFloat>(v, accessor.wrap_index(parity, x_cb, s), scale, scale_inv);
    }

    // the non-const wrap method
    __device__ __host__ inline auto wrap(int parity, int x_cb, int s)
    {
      return fieldorder_wrapper<Float, storeFloat>(v, accessor.wrap_index(parity, x_cb, s), scale, scale_inv);
    }
    // ... #ifndef DISABLE_GHOST: the Ghost(dim, dir, parity, x_cb, s, c, n)
    // accessors and wrap_ghost() methods mirror the bulk accessors via
    // ghostAccessor and the ghost scale factors; a (__CUDA_ARCH__ >= 320 &&
    // __CUDA_ARCH__ < 520) guard again selects the __ldg read-only path
    // Convert between a flattened lattice index and a coordinate vector;
    // the x dimension runs fastest (parity folding elided here)
    __device__ __host__ inline void LatticeIndex(int y[QUDA_MAX_DIM], int i) const
    {
      for (int d = 0; d < nDim; d++) {
        y[d] = i % x[d];
        i /= x[d];
      }
      // ...
    }

    __device__ __host__ inline void OffsetIndex(int &i, int y[QUDA_MAX_DIM]) const
    {
      i = 0;
      for (int d = nDim - 1; d >= 0; d--) i = x[d] * i + y[d];
      // ...
    }
    __device__ __host__ inline int X(int d) const { return x[d]; }
    __device__ __host__ inline const int *X() const { return x; }
    __device__ __host__ inline int Ncolor() const { return nColor; }
    __device__ __host__ inline int Nspin() const { return nSpin; }
    __device__ __host__ inline int Nvec() const { return nVec; }
#ifndef DISABLE_GHOST
    // ... ghost-zone geometry accessors
#endif
    __device__ __host__ inline int Ndim() const { return nDim; }
    // Field reductions: norm2 sums |z|^2 over the field via the square_
    // functor and transform_reduce, while abs_max reduces with abs_; when
    // global is true the local result is allreduced across ranks
    __host__ double norm2(bool global = true) const
    {
      // ...
    }

    __host__ double abs_max(bool global = true) const
    {
      // ...
    }
  };
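Back-checking the LatticeIndex/OffsetIndex pair defined above: a standalone round-trip (ignoring the parity folding the real accessor also performs), where LatticeIndex peels coordinates off with x running fastest and OffsetIndex folds them back.

#include <cstdio>

int main()
{
  const int nDim = 4;
  int x[nDim] = {8, 8, 8, 16};
  int i = 2025;
  int y[nDim];
  int tmp = i;
  for (int d = 0; d < nDim; d++) { y[d] = tmp % x[d]; tmp /= x[d]; } // LatticeIndex
  int j = 0;
  for (int d = nDim - 1; d >= 0; d--) j = x[d] * j + y[d];           // OffsetIndex
  std::printf("i=%d -> (%d,%d,%d,%d) -> %d\n", i, y[0], y[1], y[2], y[3], j);
}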
  // Accessor for ColorSpinorFields in native (FloatN) field order
  template <typename Float, int Ns, int Nc, int N_, bool spin_project = false, bool huge_alloc = false>
  struct FloatNOrder {
    static_assert((2 * Ns * Nc) % N_ == 0, "Internal degrees of freedom not divisible by short-vector length");
    static constexpr int length = 2 * Ns * Nc;
    static constexpr int length_ghost = spin_project ? length / 2 : length;
    static constexpr int N = N_;
    // the ghost zone may be spin projected, halving the number of spins, so
    // the vector length must shrink if N no longer divides the ghost site
    static constexpr int N_ghost = !spin_project ? N : ((Ns * Nc) % N == 0 ? N : N / 2);
    static constexpr int M = length / N;
    static constexpr int M_ghost = length_ghost / N_ghost;
    // ...
    FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, norm_type *norm_ = 0,
                Float **ghost_ = 0, bool override = false)
      // ... captures the field, norm, volume, and stride data from a
    { }

    void resetGhost(const ColorSpinorField &a, void *const *ghost_) const
    {
      for (int dim = 0; dim < 4; dim++) {
        for (int dir = 0; dir < 2; dir++) {
          ghost[2 * dim + dir] = static_cast<Float *>(ghost_[2 * dim + dir]);
          // the block-float ghost norms are packed after the spinor payload
          ghost_norm[2 * dim + dir] = reinterpret_cast<norm_type *>(
            static_cast<char *>(ghost_[2 * dim + dir]) + /* spinor payload bytes */ ...);
        }
      }
    }
    __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const
    {
      real v[length];
      // ... (nrm read from the norm array for fixed-point storage)
#pragma unroll
      for (int i = 0; i < M; i++) {
        // load a short vector, then copy-and-scale each component into registers
        Vector vecTmp = vector_load<Vector>(field + parity * offset, x + stride * i);
#pragma unroll
        for (int j = 0; j < N; j++) copy_and_scale(v[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm);
      }

#pragma unroll
      for (int i = 0; i < length / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
    }
    __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0)
    {
      real v[length];

#pragma unroll
      for (int i = 0; i < length / 2; i++) {
        v[2 * i + 0] = in[i].real();
        v[2 * i + 1] = in[i].imag();
      }

      if (isFixed<Float>::value) {
        // block normalization: find the site maximum, store it in the norm
        // array, and rescale the elements before fixed-point conversion
        // ... (max_ holds the pairwise element maxima)
        norm_type scale = 0.0;
#pragma unroll
        for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
        norm[x + parity * norm_offset] = scale;

#ifdef __CUDA_ARCH__
        real scale_inv = __fdividef(fixedMaxValue<Float>::value, scale);
#else
        real scale_inv = fixedMaxValue<Float>::value / scale;
#endif
#pragma unroll
        for (int i = 0; i < length; i++) v[i] = v[i] * scale_inv;
      }

#pragma unroll
      for (int i = 0; i < M; i++) {
        Vector vecTmp;
#pragma unroll
        for (int j = 0; j < N; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], v[i * N + j]);
        vector_store(field + parity * offset, x + stride * i, vecTmp);
      }
    }
    // loadGhost/saveGhost mirror load/save over the (possibly spin-projected)
    // ghost zone, using M_ghost vectors of length N_ghost per face site
    __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
    {
      real v[length_ghost];
      // ... (nrm read from ghost_norm[2 * dim + dir] as in load)
#pragma unroll
      for (int i = 0; i < M_ghost; i++) {
        GhostVector vecTmp = vector_load<GhostVector>(/* ghost[2 * dim + dir] + ... */);
#pragma unroll
        for (int j = 0; j < N_ghost; j++) copy_and_scale(v[i * N_ghost + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm);
      }
#pragma unroll
      for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
    }

    __device__ __host__ inline void saveGhost(const complex in[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
    {
      real v[length_ghost];
#pragma unroll
      for (int i = 0; i < length_ghost / 2; i++) {
        v[2 * i + 0] = in[i].real();
        v[2 * i + 1] = in[i].imag();
      }

      if (isFixed<Float>::value) {
        // per-face-site block normalization, as in save()
        norm_type scale = 0.0;
#pragma unroll
        for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
        // ... (scale_inv derived as in save)
#pragma unroll
        for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv;
      }

#pragma unroll
      for (int i = 0; i < M_ghost; i++) {
        GhostVector vecTmp;
#pragma unroll
        for (int j = 0; j < N_ghost; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], v[i * N_ghost + j]);
        // ... vector_store into ghost[2 * dim + dir]
      }
    }
  };
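Two of the moving parts above in a single standalone sketch (all names and values illustrative, not QUDA's): the compile-time constants for a spin-projected Wilson spinor in float8 order, and the block normalization that save() applies before fixed-point conversion.

#include <climits>
#include <cmath>
#include <cstdio>

template <int Ns, int Nc, int N, bool spin_project> struct constants {
  static_assert((2 * Ns * Nc) % N == 0, "Internal degrees of freedom not divisible by short-vector length");
  static constexpr int length = 2 * Ns * Nc;
  static constexpr int length_ghost = spin_project ? length / 2 : length;
  static constexpr int N_ghost = !spin_project ? N : ((Ns * Nc) % N == 0 ? N : N / 2);
};

int main()
{
  // Ns=4, Nc=3, float8, spin-projected ghost: 12 reals per ghost site, and
  // since Ns*Nc = 12 is not divisible by 8 the ghost vector length drops to 4
  using c = constants<4, 3, 8, true>;
  std::printf("length=%d length_ghost=%d N_ghost=%d\n", c::length, c::length_ghost, c::N_ghost); // 24 12 4

  // block normalization as in save(): store the site max in the norm array,
  // rescale, then round each element into a short
  const int len = 6;
  float v[len] = {0.1f, -2.0f, 0.5f, 1.25f, -0.75f, 0.0f};
  float site_norm = 0.0f;
  for (int i = 0; i < len; i++) site_norm = fmaxf(fabsf(v[i]), site_norm);
  float scale_inv = SHRT_MAX / site_norm;
  short stored[len];
  for (int i = 0; i < len; i++) stored[i] = static_cast<short>(lroundf(v[i] * scale_inv));
  std::printf("norm=%g stored[1]=%d recovered=%g\n", site_norm, stored[1],
              stored[1] * site_norm / SHRT_MAX); // the max element is recovered exactly
}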
  // This is just a dummy structure we use for trove to define the required structure size
  template <typename real, int length> struct S {
    real v[length];
  };
  // Host field order with color running inside spin: the complex element
  // (x, s, c) lives at ((x * Nc + c) * Ns + s)
  template <typename Float, int Ns, int Nc> struct SpaceColorSpinorOrder {
    // ... field/ghost pointers, offset, volumeCB, faceVolumeCB
    SpaceColorSpinorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0)
      // ...
    {
      for (int i = 0; i < 4; i++) {
        ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0;
        ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0;
      }
    }

    __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      // on the device use trove for coalesced loads of the AoS site
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> field_((structure *)field);
      structure v_ = field_[parity * volumeCB + x];
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) { v[s * Nc + c] = complex(v_.v[(c * Ns + s) * 2 + 0], v_.v[(c * Ns + s) * 2 + 1]); }
      }
#else
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          v[s * Nc + c] = complex(field[parity * offset + ((x * Nc + c) * Ns + s) * 2 + 0],
                                  field[parity * offset + ((x * Nc + c) * Ns + s) * 2 + 1]);
        }
      }
#endif
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> field_((structure *)field);
      structure v_;
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          v_.v[(c * Ns + s) * 2 + 0] = (Float)v[s * Nc + c].real();
          v_.v[(c * Ns + s) * 2 + 1] = (Float)v[s * Nc + c].imag();
        }
      }
      field_[parity * volumeCB + x] = v_;
#else
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          field[parity * offset + ((x * Nc + c) * Ns + s) * 2 + 0] = v[s * Nc + c].real();
          field[parity * offset + ((x * Nc + c) * Ns + s) * 2 + 1] = v[s * Nc + c].imag();
        }
      }
#endif
    }

    // ... loadGhost/saveGhost use the same color-inside-spin addressing over
    // the face volume
  };
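The only difference from SpaceSpinorColorOrder below is whether color or spin runs fastest; a standalone check of the two real-part offsets for one site (arbitrary x, s, c):

#include <cstdio>

int main()
{
  const int Ns = 4, Nc = 3, x = 2;
  int s = 1, c = 2;
  int scs = ((x * Nc + c) * Ns + s) * 2; // space-color-spin real-part offset
  int ssc = ((x * Ns + s) * Nc + c) * 2; // space-spin-color real-part offset
  std::printf("scs=%d ssc=%d\n", scs, ssc); // 66 vs 58
}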
  // Host field order with spin running inside... rather, color running
  // fastest: the complex element (x, s, c) lives at ((x * Ns + s) * Nc + c)
  template <typename Float, int Ns, int Nc> struct SpaceSpinorColorOrder {
    // ... members as in SpaceColorSpinorOrder
    SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0)
      // ...
    {
      for (int i = 0; i < 4; i++) {
        ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0;
        ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0;
      }
    }

    __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> field_((structure *)field);
      structure v_ = field_[parity * volumeCB + x];
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) { v[s * Nc + c] = complex(v_.v[(s * Nc + c) * 2 + 0], v_.v[(s * Nc + c) * 2 + 1]); }
      }
#else
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          v[s * Nc + c] = complex(field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 0],
                                  field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 1]);
        }
      }
#endif
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> field_((structure *)field);
      structure v_;
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          v_.v[(s * Nc + c) * 2 + 0] = v[s * Nc + c].real();
          v_.v[(s * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
        }
      }
      field_[parity * volumeCB + x] = v_;
#else
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real();
          field[parity * offset + ((x * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
        }
      }
#endif
    }

    // ... loadGhost/saveGhost use the same spin-then-color addressing over
    // the face volume
  };
  // SpaceSpinorColor layout over a z-padded volume
  template <typename Float, int Ns, int Nc> struct PaddedSpaceSpinorColorOrder {
    // ... members as in SpaceSpinorColorOrder, plus the unpadded and
    // extended dimensions
    PaddedSpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) :
      // ...
      dim {a.X(0), a.X(1), a.X(2), a.X(3)},
      exDim {a.X(0), a.X(1), a.X(2) + 4, a.X(3)}
    {
      for (int i = 0; i < 4; i++) {
        ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0;
        ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0;
      }
      // ...
    }

    // Compute the index into the padded field: recover the coordinate from
    // the checkerboard index, then re-linearize against the extended
    // dimensions (parity is assumed unchanged by the padding)
    __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const
    {
      int coord[4];
      getCoords(coord, x_cb, dim, parity);
      return linkIndex(coord, exDim);
    }
    __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const
    {
      int y = getPaddedIndex(x, parity);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> field_((structure *)field);
      structure v_ = field_[parity * exVolumeCB + y];
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) { v[s * Nc + c] = complex(v_.v[(s * Nc + c) * 2 + 0], v_.v[(s * Nc + c) * 2 + 1]); }
      }
#else
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          v[s * Nc + c] = complex(field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 0],
                                  field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 1]);
        }
      }
#endif
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0)
    {
      int y = getPaddedIndex(x, parity);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> field_((structure *)field);
      structure v_;
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          v_.v[(s * Nc + c) * 2 + 0] = v[s * Nc + c].real();
          v_.v[(s * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
        }
      }
      field_[parity * exVolumeCB + y] = v_;
#else
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real();
          field[parity * offset + ((y * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag();
        }
      }
#endif
    }

    // ... loadGhost/saveGhost over the face volume
  };
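Backing up to getPaddedIndex above: a standalone sketch of its arithmetic, inlining what getCoords and linkIndex (from index_helper.cuh) do, on made-up dimensions with the same z-padding of 4.

#include <cstdio>

int main()
{
  int dim[4] = {8, 8, 8, 8};    // local checkerboarded geometry
  int exDim[4] = {8, 8, 12, 8}; // z padded by 4, as in the accessor above
  int parity = 1, x_cb = 123;

  // checkerboard index -> full 4-d coordinate (x runs fastest)
  int za = x_cb / (dim[0] / 2);
  int zb = za / dim[1];
  int coord[4];
  coord[1] = za - zb * dim[1];
  coord[3] = zb / dim[2];
  coord[2] = zb - coord[3] * dim[2];
  int x1odd = (coord[1] + coord[2] + coord[3] + parity) & 1;
  coord[0] = 2 * (x_cb % (dim[0] / 2)) + x1odd;

  // linkIndex-style re-linearization against the extended dimensions
  int y = (((coord[3] * exDim[2] + coord[2]) * exDim[1] + coord[1]) * exDim[0] + coord[0]) >> 1;
  std::printf("x_cb=%d -> coord=(%d,%d,%d,%d) -> padded y=%d\n", x_cb, coord[0], coord[1], coord[2], coord[3], y);
}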
  // QDP-JIT ordering: real and imaginary parts are stored as separate
  // parity-interleaved blocks, with the checkerboard volume as the stride
  template <typename Float, int Ns, int Nc> struct QDPJITDiracOrder {
    // ...
    QDPJITDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0)
      // ...

    __device__ __host__ inline void load(complex v[Ns * Nc], int x, int parity = 0) const
    {
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          v[s * Nc + c] = complex(field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x],
                                  field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x]);
        }
      }
    }

    __device__ __host__ inline void save(const complex v[Ns * Nc], int x, int parity = 0)
    {
      for (int s = 0; s < Ns; s++) {
        for (int c = 0; c < Nc; c++) {
          field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].real();
          field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].imag();
        }
      }
    }
  };

} // namespace colorspinor
  // Allow complex<float> and complex<double> to be constructed from and
  // assigned directly from a fieldorder_wrapper
  template <typename otherFloat, typename storeFloat>
  __device__ __host__ inline complex<float>::complex(const colorspinor::fieldorder_wrapper<otherFloat, storeFloat> &a) :
    x(a.real()), y(a.imag())
  { }

  template <typename otherFloat, typename storeFloat>
  __device__ __host__ inline complex<double>::complex(const colorspinor::fieldorder_wrapper<otherFloat, storeFloat> &a) :
    x(a.real()), y(a.imag())
  { }

  template <typename otherFloat, typename storeFloat>
  __device__ __host__ inline void complex<float>::operator=(const colorspinor::fieldorder_wrapper<otherFloat, storeFloat> &a)
  {
    x = a.real();
    y = a.imag();
  }

  template <typename otherFloat, typename storeFloat>
  __device__ __host__ inline void complex<double>::operator=(const colorspinor::fieldorder_wrapper<otherFloat, storeFloat> &a)
  {
    x = a.real();
    y = a.imag();
  }

  // colorspinor_mapper selects the canonical accessor for a given
  // (precision, spin, color) combination, e.g. FloatNOrder<double, Ns, Nc, 2>,
  // FloatNOrder<float, 4, Nc, 4>, and FloatNOrder<short/int8_t, 4, Nc, N8>,
  // with spin-projected variants when project = true
  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false> struct colorspinor_mapper {
  };
  // ... partial specializations as enumerated above

#endif // _COLOR_SPINOR_ORDER_H
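Finally, a standalone sketch of the mapper idiom itself, with a mock order type standing in for FloatNOrder (the N values mirror the specializations above but are illustrative):

#include <cstdio>

template <typename Float, int Ns, int Nc, int N> struct MockOrder {
  static constexpr int n = N;
};

template <typename T, int Ns, int Nc> struct colorspinor_mapper_sketch;

// each (precision, spin) pair picks its preferred short-vector length
template <int Nc> struct colorspinor_mapper_sketch<double, 4, Nc> { typedef MockOrder<double, 4, Nc, 2> type; };
template <int Nc> struct colorspinor_mapper_sketch<float, 4, Nc> { typedef MockOrder<float, 4, Nc, 4> type; };
template <int Nc> struct colorspinor_mapper_sketch<short, 4, Nc> { typedef MockOrder<short, 4, Nc, 8> type; };

int main()
{
  typedef colorspinor_mapper_sketch<float, 4, 3>::type accessor_t;
  std::printf("vector length N = %d\n", accessor_t::n); // 4
}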
External symbols referenced above (defined in index_helper.cuh, complex_quda.h, the communication layer, and the QUDA enum headers):

  __host__ __device__ ValueType norm(const complex<ValueType> &z)  - the magnitude of z squared
  __host__ __device__ ValueType abs(ValueType x)
  __device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
  __device__ __forceinline__ T __ldg(const T *ptr)
  void transform_reduce(Arg &arg)
  int comm_dim_partitioned(int dim)
  void comm_allreduce(double *data), void comm_allreduce_max(double *data)
  #define safe_malloc(size), #define qudaMemcpy(dst, src, count, kind)
  #define QUDA_MAX_DIM  - maximum number of dimensions supported by QUDA;
    in practice, no routines make use of more than 5
  QudaFieldOrder (QUDA_FLOAT2_FIELD_ORDER, QUDA_FLOAT4_FIELD_ORDER, QUDA_FLOAT8_FIELD_ORDER,
    QUDA_SPACE_SPIN_COLOR_FIELD_ORDER, QUDA_SPACE_COLOR_SPIN_FIELD_ORDER),
    QudaFieldLocation, QudaGammaBasis