8 namespace CloverOrder {
16 #ifdef GPU_CLOVER_DIRAC
18 template<
typename Clover1,
typename Clover2,
typename Gauge>
19 struct CloverTraceArg {
26 CloverTraceArg(Clover1 &clover1, Clover2 &clover2, Gauge &
gauge,
int dir1,
int dir2)
27 : clover1(clover1), clover2(clover2), gauge(gauge), dir1(dir1), dir2(dir2) {}
31 template <
typename Float,
typename Clover1,
typename Clover2,
typename Gauge>
32 __device__ __host__
void cloverSigmaTraceCompute(CloverTraceArg<Clover1, Clover2, Gauge>&
arg,
int x,
int parity)
58 const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
62 arg.clover1.load(A,x,parity);
64 arg.clover2.load(A,x,parity);
67 for(
int ch=0; ch<2; ++ch){
69 for (
int i=0; i<6; i++) diag[ch][i] = 2.0*A[ch*36+i];
70 for (
int i=0; i<15; i++) tri[ch][idtab[i]] = complex<Float>(2.0*A[ch*36+6+2*i], 2.0*A[ch*36+6+2*i+1]);
77 for(
int j=0; j<3; ++j){
78 mat(j,j).y = diag[0][j+3] + diag[1][j+3] - diag[0][j] - diag[1][j];
83 for(
int j=1; j<3; ++j){
84 int jk2 = (j+3)*(j+2)/2 + 3;
85 for(
int k=0; k<j; ++k){
86 ctmp = tri[0][jk2] + tri[1][jk2] - tri[0][jk] - tri[1][jk];
101 for(
int j=0; j<3; ++j){
102 int jk = (j+3)*(j+2)/2;
103 for(
int k=0; k<3; ++k){
104 int kj = (k+3)*(k+2)/2 + j;
105 ctmp =
conj(tri[0][kj]) - tri[0][jk] +
conj(tri[1][kj]) - tri[1][jk];
113 for(
int j=0; j<3; ++j){
114 int jk = (j+3)*(j+2)/2;
115 for(
int k=0; k<3; ++k){
116 int kj = (k+3)*(k+2)/2 + j;
117 ctmp =
conj(tri[0][kj]) + tri[0][jk] -
conj(tri[1][kj]) - tri[1][jk];
128 for(
int j=0; j<3; ++j){
129 int jk = (j+3)*(j+2)/2;
130 for(
int k=0; k<3; ++k){
131 int kj = (k+3)*(k+2)/2 + j;
132 ctmp =
conj(tri[0][kj]) + tri[0][jk] +
conj(tri[1][kj]) + tri[1][jk];
139 for(
int j=0; j<3; ++j){
140 int jk = (j+3)*(j+2)/2;
141 for(
int k=0; k<3; ++k){
142 int kj = (k+3)*(k+2)/2 + j;
143 ctmp =
conj(tri[0][kj]) - tri[0][jk] -
conj(tri[1][kj]) + tri[1][jk];
153 for(
int j=0; j<3; ++j){
154 mat(j,j).y = diag[0][j] - diag[0][j+3] - diag[1][j] + diag[1][j+3];
157 for(
int j=1; j<3; ++j){
158 int jk2 = (j+3)*(j+2)/2 + 3;
159 for(
int k=0; k<j; ++k){
160 ctmp = tri[0][jk] - tri[0][jk2] - tri[1][jk] + tri[1][jk2];
174 arg.gauge.save((
Float*)(mat.
data), x, 0, parity);
179 template<
typename Float,
typename Clover1,
typename Clover2,
typename Gauge>
180 void cloverSigmaTrace(CloverTraceArg<Clover1,Clover2,Gauge> arg)
182 for(
int x=0; x<arg.clover1.volumeCB; x++){
183 cloverSigmaTraceCompute<Float,Clover1,Clover2,Gauge>(
arg,
x, 1);
189 template<
typename Float,
typename Clover1,
typename Clover2,
typename Gauge>
190 __global__
void cloverSigmaTraceKernel(CloverTraceArg<Clover1,Clover2,Gauge> arg)
192 int idx = blockIdx.x*blockDim.x + threadIdx.x;
193 if(idx >= arg.clover1.volumeCB)
return;
195 cloverSigmaTraceCompute<Float,Clover1,Clover2,Gauge>(
arg,
idx, 1);
198 template<
typename Float,
typename Clover1,
typename Clover2,
typename Gauge>
199 class CloverSigmaTrace :
Tunable {
200 CloverTraceArg<Clover1,Clover2,Gauge>
arg;
205 unsigned int sharedBytesPerThread()
const {
return 0; }
206 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
208 bool tuneSharedBytes()
const {
return false; }
209 bool tuneGridDim()
const {
return false; }
210 unsigned int minThreads()
const {
return arg.clover1.volumeCB; }
214 : arg(arg), meta(meta), location(location) {
215 writeAuxString(
"stride=%d", arg.clover1.stride);
217 virtual ~CloverSigmaTrace() {;}
219 void apply(
const cudaStream_t &
stream){
221 #if (__COMPUTE_CAPABILITY__ >= 200)
222 dim3 blockDim(128, 1, 1);
223 dim3 gridDim((arg.clover1.volumeCB + blockDim.x - 1)/blockDim.x, 1, 1);
224 cloverSigmaTraceKernel<Float,Clover1,Clover2,Gauge><<<gridDim,blockDim,0>>>(
arg);
226 errorQuda(
"cloverSigmaTrace not supported on pre-Fermi architecture");
229 cloverSigmaTrace<Float,Clover1,Clover2,Gauge>(
arg);
233 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
236 std::stringstream ps;
237 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
242 long long flops()
const {
return 0; }
243 long long bytes()
const {
return 0; }
248 template<
typename Float,
typename Clover1,
typename Clover2,
typename Gauge>
252 CloverTraceArg<Clover1, Clover2, Gauge>
arg(clover1, clover2, gauge, dir1, dir2);
253 CloverSigmaTrace<Float,Clover1,Clover2,Gauge> traceCompute(arg, meta, location);
254 traceCompute.apply(0);
255 cudaDeviceSynchronize();
261 template<
typename Float>
313 #ifdef GPU_CLOVER_DIRAC
315 errorQuda(
"Half precision not supported\n");
319 computeCloverSigmaTrace<float>(
gauge, clover, dir1, dir2,
location);
321 computeCloverSigmaTrace<double>(
gauge, clover, dir1, dir2,
location);
__device__ __host__ void setZero(Matrix< T, N > *m)
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
std::complex< double > Complex
QudaGaugeFieldOrder Order() const
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
__host__ __device__ ValueType imag() const volatile
QudaPrecision Precision() const
cudaColorSpinorField * tmp
const QudaFieldLocation location
FloatingPoint< float > Float
QudaCloverFieldOrder Order() const
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType real() const volatile
void computeCloverSigmaTrace(GaugeField &gauge, const CloverField &clover, int dir1, int dir2, QudaFieldLocation location)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.