4 template <
typename Order,
int nDim>
8 unsigned short X[nDim];
9 unsigned short A[nDim];
10 unsigned short B[nDim];
11 unsigned short C[nDim];
15 const int *B_,
const int *C_,
const int f_[nDim][nDim],
const int *localParity_)
16 : order(order), nFace(nFace) {
17 for (
int d=0; d<nDim; d++) {
22 for (
int e=0; e<nDim; e++)
f[d][e] = f_[d][e];
32 template <
typename Float,
int length,
int nDim,
typename Order>
46 for (
int a=0; a<arg.
A[
dim]; a++) {
47 for (
int b=0; b<arg.
B[
dim]; b++) {
48 for (
int c=0; c<arg.
C[
dim]; c++) {
50 int indexCB = (a*arg.
f[
dim][0] + b*arg.
f[
dim][1] + c*arg.
f[
dim][2] + d*arg.
f[
dim][3]) >> 1;
52 int oddness = (a+b+c+d) & 1;
65 assert(indexDst == arg.
order.faceVolumeCB[
dim]);
77 template <
typename Float,
int length,
int nDim,
typename Order>
85 int X = blockIdx.x * blockDim.x + threadIdx.x;
87 if (X >= 2*arg.
order.faceVolumeCB[
dim])
continue;
89 int dab = X/arg.
C[
dim];
90 int c = X - dab*arg.
C[
dim];
91 int da = dab/arg.
B[
dim];
92 int b = dab - da*arg.
B[
dim];
93 int d = da / arg.
A[
dim];
94 int a = da - d * arg.
A[
dim];
98 int indexCB = (a*arg.
f[
dim][0] + b*arg.
f[
dim][1] + c*arg.
f[
dim][2] + d*arg.
f[
dim][3]) >> 1;
100 int oddness = (a+b+c+d)&1;
113 template <
typename Float,
int length,
int nDim,
typename Order>
120 unsigned int sharedBytesPerThread()
const {
return 0; }
121 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
123 bool tuneGridDim()
const {
return false; }
124 unsigned int minThreads()
const {
return size; }
129 for (
int d=0; d<nDim; d++)
130 faceMax = (arg.
order.faceVolumeCB[d] > faceMax )
131 ? arg.
order.faceVolumeCB[d] : faceMax;
140 #if (__COMPUTE_CAPABILITY__ >= 200)
142 extractGhostKernel<Float, length, nDim, Order>
145 errorQuda(
"extractGhost not supported on pre-Fermi architecture");
152 std::stringstream ps;
153 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
158 long long flops()
const {
return 0; }
161 for (
int d=0; d<nDim; d++) sites += arg.order.faceVolumeCB[d];
162 return 2 * sites * 2 * arg.order.Bytes();
171 template <
typename Float,
int length,
typename Order>
173 const int *
X = u.
X();
174 const int nFace = u.
Nface();
179 int A[nDim], B[nDim], C[nDim];
180 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
181 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
182 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
183 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
187 {X[0]*X[1]*X[2], X[0]*X[1], X[0], 1},
188 {X[0]*X[1]*X[2], X[0]*X[1], 1, X[0]},
189 {X[0]*X[1]*X[2], X[0], 1, X[0]*X[1]},
190 { X[0]*X[1], X[0], 1, X[0]*X[1]*X[2]}
197 int localParity[nDim];
204 extractGhost<Float,length,nDim,Order>(
arg);
213 template <
typename Float>
255 #ifdef BUILD_QDP_INTERFACE
258 errorQuda(
"QDP interface has not been built\n");
263 #ifdef BUILD_QDPJIT_INTERFACE
266 errorQuda(
"QDPJIT interface has not been built\n");
271 #ifdef BUILD_CPS_INTERFACE
274 errorQuda(
"CPS interface has not been built\n");
279 #ifdef BUILD_MILC_INTERFACE
282 errorQuda(
"MILC interface has not been built\n");
287 #ifdef BUILD_BQCD_INTERFACE
290 errorQuda(
"BQCD interface has not been built\n");
295 #ifdef BUILD_TIFR_INTERFACE
298 errorQuda(
"TIFR interface has not been built\n");
309 #if __COMPUTE_CAPABILITY__ < 200
311 errorQuda(
"Reconstruct 9/13 not supported on pre-Fermi architecture");
QudaVerbosity getVerbosity()
void extractGaugeGhost(const GaugeField &u, void **ghost)
QudaGaugeFieldOrder Order() const
void extractGhost(ExtractGhostArg< Order, nDim > arg)
QudaPrecision Precision() const
void writeAuxString(const char *format,...)
const QudaFieldLocation location
FloatingPoint< float > Float
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
const char * VolString() const
__global__ void extractGhostKernel(ExtractGhostArg< Order, nDim > arg)
enum QudaFieldLocation_s QudaFieldLocation
QudaLinkType LinkType() const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.