QUDA  v1.1.0
A library for QCD on GPUs
clover_trace_quda.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <clover_field.h>
5 #include <gauge_field.h>
6 #include <gauge_field_order.h>
7 #include <clover_field_order.h>
8 
9 namespace quda {
10 
11 #ifdef GPU_CLOVER_DIRAC
12 
13  template<typename Float, typename Clover1, typename Clover2, typename Gauge>
14  struct CloverTraceArg {
15  Clover1 clover1;
16  Clover2 clover2;
17  Gauge gauge;
18  Float coeff;
19 
20  CloverTraceArg(Clover1 &clover1, Clover2 &clover2, Gauge &gauge, Float coeff)
21  : clover1(clover1), clover2(clover2), gauge(gauge), coeff(coeff) {}
22  };
23 
24 
25  template <typename Float, typename Arg>
26  __device__ __host__ void cloverSigmaTraceCompute(Arg & arg, const int x, int parity)
27  {
28 
29  Float A[72];
30  if (parity==0) arg.clover1.load(A,x,parity);
31  else arg.clover2.load(A,x,parity);
32 
33  // load the clover term into memory
34  for (int mu=0; mu<4; mu++) {
35  for (int nu=0; nu<mu; nu++) {
36 
37  Matrix<complex<Float>,3> mat;
38  setZero(&mat);
39 
40  Float diag[2][6];
41  complex<Float> tri[2][15];
42  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
43  complex<Float> ctmp;
44 
45  for (int ch=0; ch<2; ++ch) {
46  // factor of two is inherent to QUDA clover storage
47  for (int i=0; i<6; i++) diag[ch][i] = 2.0*A[ch*36+i];
48  for (int i=0; i<15; i++) tri[ch][idtab[i]] = complex<Float>(2.0*A[ch*36+6+2*i], 2.0*A[ch*36+6+2*i+1]);
49  }
50 
51  // X, Y
52  if (nu == 0) {
53  if (mu == 1) {
54  for (int j=0; j<3; ++j) {
55  mat(j,j).y = diag[0][j+3] + diag[1][j+3] - diag[0][j] - diag[1][j];
56  }
57 
58  // triangular part
59  int jk=0;
60  for (int j=1; j<3; ++j) {
61  int jk2 = (j+3)*(j+2)/2 + 3;
62  for (int k=0; k<j; ++k) {
63  ctmp = tri[0][jk2] + tri[1][jk2] - tri[0][jk] - tri[1][jk];
64 
65  mat(j,k).x = -ctmp.imag();
66  mat(j,k).y = ctmp.real();
67  mat(k,j).x = ctmp.imag();
68  mat(k,j).y = ctmp.real();
69 
70  jk++; jk2++;
71  }
72  } // X Y
73 
74  } else if (mu == 2) {
75 
76  for (int j=0; j<3; ++j) {
77  int jk = (j+3)*(j+2)/2;
78  for (int k=0; k<3; ++k) {
79  int kj = (k+3)*(k+2)/2 + j;
80  mat(j,k) = conj(tri[0][kj]) - tri[0][jk] + conj(tri[1][kj]) - tri[1][jk];
81  jk++;
82  }
83  } // X Z
84 
85  } else if (mu == 3) {
86  for (int j=0; j<3; ++j) {
87  int jk = (j+3)*(j+2)/2;
88  for (int k=0; k<3; ++k) {
89  int kj = (k+3)*(k+2)/2 + j;
90  ctmp = conj(tri[0][kj]) + tri[0][jk] - conj(tri[1][kj]) - tri[1][jk];
91  mat(j,k).x = -ctmp.imag();
92  mat(j,k).y = ctmp.real();
93  jk++;
94  }
95  }
96  } // mu == 3 // X T
97  } else if (nu == 1) {
98  if (mu == 2) { // Y Z
99  for (int j=0; j<3; ++j) {
100  int jk = (j+3)*(j+2)/2;
101  for (int k=0; k<3; ++k) {
102  int kj = (k+3)*(k+2)/2 + j;
103  ctmp = conj(tri[0][kj]) + tri[0][jk] + conj(tri[1][kj]) + tri[1][jk];
104  mat(j,k).x = ctmp.imag();
105  mat(j,k).y = -ctmp.real();
106  jk++;
107  }
108  }
109  } else if (mu == 3){ // Y T
110  for (int j=0; j<3; ++j) {
111  int jk = (j+3)*(j+2)/2;
112  for (int k=0; k<3; ++k) {
113  int kj = (k+3)*(k+2)/2 + j;
114  mat(j,k) = conj(tri[0][kj]) - tri[0][jk] - conj(tri[1][kj]) + tri[1][jk];
115  jk++;
116  }
117  }
118  } // mu == 3
119  } // nu == 1
120  else if (nu == 2){
121  if (mu == 3) {
122  for (int j=0; j<3; ++j) {
123  mat(j,j).y = diag[0][j] - diag[0][j+3] - diag[1][j] + diag[1][j+3];
124  }
125  int jk=0;
126  for (int j=1; j<3; ++j) {
127  int jk2 = (j+3)*(j+2)/2 + 3;
128  for (int k=0; k<j; ++k) {
129  ctmp = tri[0][jk] - tri[0][jk2] - tri[1][jk] + tri[1][jk2];
130  mat(j,k).x = -ctmp.imag();
131  mat(j,k).y = ctmp.real();
132 
133  mat(k,j).x = ctmp.imag();
134  mat(k,j).y = ctmp.real();
135  jk++; jk2++;
136  }
137  }
138  }
139  }
140 
141  mat *= arg.coeff;
142  arg.gauge((mu-1)*mu/2 + nu, x, parity) = mat;
143  } // nu
144  } // mu
145 
146  return;
147  }
148 
149  template<typename Float, typename Arg>
150  void cloverSigmaTrace(Arg &arg)
151  {
152  for (int x=0; x<arg.clover1.volumeCB; x++) {
153  cloverSigmaTraceCompute<Float,Arg>(arg, x, 1);
154  }
155  return;
156  }
157 
158 
159  template<typename Float, typename Arg>
160  __global__ void cloverSigmaTraceKernel(Arg arg)
161  {
162  int idx = blockIdx.x*blockDim.x + threadIdx.x;
163  if (idx >= arg.clover1.volumeCB) return;
164  // odd parity
165  cloverSigmaTraceCompute<Float,Arg>(arg, idx, 1);
166  }
167 
168  template<typename Float, typename Arg>
169  class CloverSigmaTrace : Tunable {
170  Arg &arg;
171  const GaugeField &meta;
172 
173  private:
174  unsigned int sharedBytesPerThread() const { return 0; }
175  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
176 
177  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory
178  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
179  unsigned int minThreads() const { return arg.clover1.volumeCB; }
180 
181  public:
182  CloverSigmaTrace(Arg &arg, const GaugeField &meta)
183  : arg(arg), meta(meta) {
184  writeAuxString("stride=%d", arg.clover1.stride);
185  }
186  virtual ~CloverSigmaTrace() {;}
187 
188  void apply(const qudaStream_t &stream){
189  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
190  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
191  qudaLaunchKernel(cloverSigmaTraceKernel<Float,Arg>, tp, stream, arg);
192  } else {
193  cloverSigmaTrace<Float,Arg>(arg);
194  }
195  }
196 
197  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
198 
199  long long flops() const { return 0; } // Fix this
200  long long bytes() const { return (arg.clover1.Bytes() + 6*arg.gauge.Bytes()) * arg.clover1.volumeCB; }
201 
202  }; // CloverSigmaTrace
203 
204 
205  template<typename Float, typename Clover1, typename Clover2, typename Gauge>
206  void computeCloverSigmaTrace(Clover1 clover1, Clover2 clover2, Gauge gauge,
207  const GaugeField &meta, Float coeff)
208  {
209  typedef CloverTraceArg<Float, Clover1, Clover2, Gauge> Arg;
210  Arg arg(clover1, clover2, gauge, coeff);
211  CloverSigmaTrace<Float, Arg> traceCompute(arg, meta);
212  traceCompute.apply(0);
213  return;
214  }
215 
216  template<typename Float>
217  void computeCloverSigmaTrace(GaugeField& gauge, const CloverField& clover, Float coeff){
218 
219  if(clover.isNative()) {
220  typedef typename clover_mapper<Float>::type C;
221  if (gauge.isNative()) {
222  if (gauge.Reconstruct() == QUDA_RECONSTRUCT_NO) {
223  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
224  computeCloverSigmaTrace<Float>( C(clover,0), C(clover,1), G(gauge), gauge, coeff);
225  } else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12) {
226  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
227  computeCloverSigmaTrace<Float>( C(clover,0), C(clover,1), G(gauge), gauge, coeff);
228  } else {
229  errorQuda("Reconstruction type %d not supported", gauge.Reconstruct());
230  }
231  } else {
232  errorQuda("Gauge order %d not supported", gauge.Order());
233  }
234  } else {
235  errorQuda("clover order %d not supported", clover.Order());
236  } // clover order
237 
238  }
239 
240 #endif
241 
242  void computeCloverSigmaTrace(GaugeField& output, const CloverField& clover, double coeff) {
243 
244 #ifdef GPU_CLOVER_DIRAC
245  if (clover.Precision() == QUDA_SINGLE_PRECISION) {
246  computeCloverSigmaTrace<float>(output, clover, static_cast<float>(coeff));
247  } else if (clover.Precision() == QUDA_DOUBLE_PRECISION){
248  computeCloverSigmaTrace<double>(output, clover, coeff);
249  } else {
250  errorQuda("Precision %d not supported", clover.Precision());
251  }
252 #else
253  errorQuda("Clover has not been built");
254 #endif
255 
256  }
257 
258 
259 } // namespace quda