QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
clover_quda.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <clover_field.h>
4 #include <gauge_field.h>
5 
6 namespace quda {
7 
8  struct CloverParam {
9  int threads; // number of active threads required
10  int X[4]; // grid dimensions
11  int gaugeLengthCB; // half length (include checkerboard spacetime, color, complex, direction)
12  int gaugeStride; // stride used on gauge field
13  int FmunuLengthCB; // half length (include checkerboard spacetime, color, complex, direction)
14  int FmunuStride; // stride used on Fmunu field
15  };
16 
25  __device__ inline int linkIndex(int x[], int dx[], const CloverParam &param) {
26  int y[4];
27  for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + param.X[i]-1) % param.X[i];
28  int idx = (((y[3]*param.X[2] + y[2])*param.X[1] + y[1])*param.X[0] + y[0]) >> 1;
29  return idx;
30  }
31 
40  template <typename Cmplx>
41  __global__ void computeFmunuKernel(Cmplx *Fmunu, const Cmplx *gauge, const CloverParam param) {
42 
43  // this is a full lattice index
44  int X = blockIdx.x * blockDim.x + threadIdx.x;
45  if (X >= param.threads) return;
46 
47  // thread ordering
48  // X = (((parity*X4 + x4)*X3 + x3)*X2 + x2)*X1h + x1h
49  // with x1 = 2*x1h + parity
50 
51  // For multi-gpu these need to have offets added on
52  // keep X[d] as the true local problem
53 
54  // compute spacetime dimensions and parity
55  int aux1 = X / (param.X[0]/2);
56  int x[4];
57  x[0] = X - aux1 * (param.X[0]/2); // this is chbd x
58  int aux2 = aux1 / param.X[1];
59  x[1] = aux1 - aux2 * param.X[1];
60  int aux3 = aux2 / param.X[2];
61  x[2] = aux2 - aux3 * param.X[2];
62  int parity = aux3 / param.X[3];
63  x[3] = aux3 - parity * param.X[3];
64  x[0] += parity; // now this is the full index
65 
66  for (int mu=0; mu<4; mu++) {
67  for (int nu=0; nu<mu; nu++) {
69  setZero(&F);
70 
71  { // positive mu, nu
72 
73  // load U(x)_(+mu)
74  Matrix<Cmplx,3> U1;
75  int dx[4] = {0, 0, 0, 0};
76  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, mu, linkIndex(x, dx, param),
77  param.gaugeStride, &U1);
78 
79  // load U(x+mu)_(+nu)
80  Matrix<Cmplx,3> U2;
81  dx[mu]++;
82  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
83  param.gaugeStride, &U2);
84  dx[mu]--;
85 
86  Matrix<Cmplx,3> Ftmp = U2 * U1;
87 
88  // load U(x+nu)_(+mu)
89  Matrix<Cmplx,3> U3;
90  dx[nu]++;
91  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, mu, linkIndex(x,dx,param),
92  param.gaugeStride, &U3);
93  dx[nu]--;
94 
95  Ftmp = conj(U3) * Ftmp;
96 
97  // load U(x)_(+nu)
98  Matrix<Cmplx,3> U4;
99  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
100  param.gaugeStride, &U4);
101 
102  // complete the plaquette
103  Ftmp = conj(U4) * Ftmp;
104 
105  // sum this contribution to Fmunu
106  F += Ftmp + conj(Ftmp);
107  }
108 
109  { // positive mu, negative nu
110 
111  // load U(x)_(+mu)
112  Matrix<Cmplx,3> U1;
113  int dx[4] = {0, 0, 0, 0};
114  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, mu, linkIndex(x, dx, param),
115  param.gaugeStride, &U1);
116 
117  // load U(x+mu)_(-nu) = U(x+mu-nu)_(+nu)
118  Matrix<Cmplx,3> U2;
119  dx[mu]++;
120  dx[nu]--;
121  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
122  param.gaugeStride, &U2);
123  dx[nu]++;
124  dx[mu]--;
125 
126  Matrix<Cmplx,3> Ftmp = conj(U2) * U1;
127 
128  // load U(x-nu)_mu
129  Matrix<Cmplx,3> U3;
130  dx[nu]--;
131  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, mu, linkIndex(x,dx,param),
132  param.gaugeStride, &U3);
133  dx[nu]++;
134 
135  Ftmp = conj(U3) * Ftmp;
136 
137  // load U(x)_(-nu) = U(x-nu)_(+nu)
138  Matrix<Cmplx,3> U4;
139  dx[nu]--;
140  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
141  param.gaugeStride, &U4);
142  dx[nu]++;
143 
144  // complete the plaquette
145  Ftmp = U4 * Ftmp;
146 
147  // sum this contribution to Fmunu
148  F += Ftmp + conj(Ftmp);
149  }
150 
151 
152  { // negative mu, positive nu
153 
154  // load U(x)_(-mu)
155  Matrix<Cmplx,3> U1;
156  int dx[4] = {0, 0, 0, 0};
157  dx[mu]--;
158  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, mu, linkIndex(x, dx, param),
159  param.gaugeStride, &U1);
160  dx[mu]++;
161 
162  // load U(x-mu)_(+nu)
163  Matrix<Cmplx,3> U2;
164  dx[mu]--;
165  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
166  param.gaugeStride, &U2);
167  dx[mu]++;
168 
169  Matrix<Cmplx,3> Ftmp = U2 * conj(U1);
170 
171  // load U(x+nu)_(+mu)
172  Matrix<Cmplx,3> U3;
173  dx[nu]++;
174  dx[mu]--;
175  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, mu, linkIndex(x,dx,param),
176  param.gaugeStride, &U3);
177  dx[mu]++;
178  dx[nu]--;
179 
180  Ftmp = U3 * Ftmp;
181 
182  // load U(x)_(+nu)
183  Matrix<Cmplx,3> U4;
184  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
185  param.gaugeStride, &U4);
186 
187  // complete the plaquette
188  Ftmp = conj(U4) * Ftmp;
189 
190  // sum this contribution to Fmunu
191  F += Ftmp + conj(Ftmp);
192  }
193 
194  { // negative mu, negative nu
195 
196  // load U(x)_(-mu)
197  Matrix<Cmplx,3> U1;
198  int dx[4] = {0, 0, 0, 0};
199  dx[mu]--;
200  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, mu, linkIndex(x, dx, param),
201  param.gaugeStride, &U1);
202  dx[mu]++;
203 
204  // load U(x-mu)_(-nu) = U(x-mu-nu)_(+nu)
205  Matrix<Cmplx,3> U2;
206  dx[mu]--;
207  dx[nu]--;
208  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
209  param.gaugeStride, &U2);
210  dx[nu]++;
211  dx[mu]++;
212 
213  Matrix<Cmplx,3> Ftmp = conj(U2) * conj(U1);
214 
215  // load U(x-nu)_mu
216  Matrix<Cmplx,3> U3;
217  dx[mu]--;
218  dx[nu]--;
219  loadLinkVariableFromArray(gauge+((parity+1)&1)*param.gaugeLengthCB, mu, linkIndex(x,dx,param),
220  param.gaugeStride, &U3);
221  dx[nu]++;
222  dx[mu]++;
223 
224  Ftmp = U3 * Ftmp;
225 
226  // load U(x)_(-nu) = U(x-nu)_(+nu)
227  Matrix<Cmplx,3> U4;
228  dx[nu]--;
229  loadLinkVariableFromArray(gauge+parity*param.gaugeLengthCB, nu, linkIndex(x,dx,param),
230  param.gaugeStride, &U4);
231  dx[nu]++;
232 
233  // complete the plaquette
234  Ftmp = U4 * Ftmp;
235 
236  // sum this contribution to Fmunu
237  F += Ftmp + conj(Ftmp);
238  }
239 
240 
241  int munu_idx = (mu*(mu-1))/2 + nu; // lower-triangular indexing
242  writeLinkVariableToArray(F, munu_idx, X/2, param.FmunuStride, Fmunu+parity*param.FmunuLengthCB);
243  }
244  }
245 
246  }
247 
249 
250 #ifdef GPU_CLOVER_DIRAC
251  using namespace quda;
252 
253  // first create the field-strength tensor
254  int pad = 0;
255  GaugeFieldParam tensor_param(gauge.X(), gauge.Precision(), QUDA_RECONSTRUCT_NO, pad, QUDA_TENSOR_GEOMETRY);
256  cudaGaugeField Fmunu(tensor_param);
257 
258  // set the kernel parameters
260  param.threads = gauge.Volume();
261  for (int i=0; i<4; i++) param.X[i] = gauge.X()[i];
262  param.gaugeLengthCB = gauge.Length() / 2;
263  param.gaugeStride = gauge.Stride();
264  param.FmunuLengthCB = Fmunu.Length() / 2;
265  param.FmunuStride = Fmunu.Stride();
266 
267  dim3 blockDim(32, 1, 1);
268  dim3 gridDim((param.threads + blockDim.x - 1) / blockDim.x, 1, 1);
269 
270  if (gauge.Precision() == QUDA_DOUBLE_PRECISION) {
271  computeFmunuKernel<<<gridDim,blockDim>>>((double2*)Fmunu.Gauge_p(), (double2*)gauge.Gauge_p(), param);
272  } else {
273  computeFmunuKernel<<<gridDim,blockDim>>>((float2*)Fmunu.Gauge_p(), (float2*)gauge.Gauge_p(), param);
274  }
275 
276  // Now contract this into the clover term
277 
278 
279 #else
280  errorQuda("Clover dslash has not been built");
281 #endif
282 
283  }
284 
285 } // namespace quda
286