QUDA  v1.1.0
A library for QCD on GPUs
clover_field.cpp
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5 #include <typeinfo>
6 
7 #include <quda_internal.h>
8 #include <clover_field.h>
9 #include <gauge_field.h>
10 #include <color_spinor_field.h>
11 #include <blas_quda.h>
12 
13 namespace quda {
14 
// NOTE(review): Doxygen extraction artifact — the constructor signature
// (source lines 14-16) was dropped; from the initializer list this is
// presumably CloverFieldParam::CloverFieldParam(const CloverField &a).
// Confirm against the upstream file. Source line 35 is also missing.
// Populates this param struct from an existing field: aliases its data
// pointers via a.V(false)/a.V(true), copies the physics coefficients
// (csw, coeff, twisted, mu2, rho) and layout metadata, and requests no
// allocation (QUDA_NULL_FIELD_CREATE).
17  direct(a.V(false)),
18  inverse(a.V(true)),
19  clover(nullptr),
20  norm(nullptr),
21  cloverInv(nullptr),
22  invNorm(nullptr),
23  csw(a.Csw()),
24  coeff(a.Coeff()),
25  twisted(a.Twisted()),
26  mu2(a.Mu2()),
27  rho(a.Rho()),
28  order(a.Order()),
29  create(QUDA_NULL_FIELD_CREATE),
30  location(a.Location())
31  {
// Copy the base LatticeField metadata that is not in the initializer list.
32  precision = a.Precision();
33  nDim = a.Ndim();
34  pad = a.Pad();
// Copy the local lattice extents dimension by dimension.
36  for (int dir = 0; dir < nDim; ++dir) x[dir] = a.X()[dir];
37  }
38 
// NOTE(review): extraction dropped the signature (source line 39, presumably
// CloverField::CloverField(const CloverFieldParam &param)) and the guard
// lines 46, 52, 54 and 56 (likely the QDPJIT-order check, the bytes
// computation, and precision conditionals) — confirm upstream.
// Base constructor: validates dimensionality, derives the field lengths and
// byte counts from the checkerboard volume/stride, and initializes the
// twisted-mass metadata.
40  LatticeField(param), bytes(0), norm_bytes(0), nColor(3), nSpin(4),
41  clover(0), norm(0), cloverInv(0), invNorm(0), csw(param.csw), coeff(param.coeff),
42  rho(param.rho), order(param.order), create(param.create), trlog{0, 0}
43  {
// Clover fields are only defined on 4-dimensional lattices.
44  if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim);
45 
47  errorQuda("QDPJIT ordered clover fields only supported for reference fields");
48 
// Two parities x volumeCB sites; the /2 reflects the block-diagonal Hermitian
// storage (72 reals per site), per the original comment below.
49  real_length = 2 * ((size_t)volumeCB) * nColor * nColor * nSpin * nSpin / 2; // block-diagonal Hermitian (72 reals)
50  length = 2 * ((size_t)stride) * nColor * nColor * nSpin * nSpin / 2;
51 
// Pad each parity half to the alignment boundary for native-order fields.
53  if (isNative()) bytes = 2*ALIGNMENT_ADJUST(bytes/2);
55  norm_bytes = sizeof(float)*2*stride*2; // 2 chirality
57  }
58 //for twisted mass only:
// NOTE(review): twisted/mu2 are deliberately forced off here rather than
// taken from param (see the commented-out originals).
59  twisted = false;//param.twisted;
60  mu2 = 0.0; //param.mu2;
61  }
62 
64 
// Factory method: builds a cpuCloverField or cudaCloverField according to
// param.location and returns the new instance; aborts via errorQuda on an
// invalid location.
// NOTE(review): extraction dropped the signature (source line 65) and the
// opening `if (param.location == QUDA_CPU_FIELD_LOCATION) {` branch
// (source line 69) — confirm against the upstream file.
66  {
67 
68  CloverField *field = nullptr;
70  field = new cpuCloverField(param);
71  } else if (param.location == QUDA_CUDA_FIELD_LOCATION) {
72  field = new cudaCloverField(param);
73  } else {
74  errorQuda("Invalid field location %d", param.location);
75  }
76 
// Ownership of the allocated field passes to the caller.
77  return field;
78  }
79 
80  void CloverField::setRho(double rho_)
81  {
82  rho = rho_;
83  }
84 
// NOTE(review): extraction dropped the constructor signature (source line 85,
// presumably cudaCloverField::cudaCloverField(const CloverFieldParam &param)
// delegating to CloverField(param)), the create-type guard on line 87, and
// the norm-allocation / precision lines 91, 93-94, 106, 120, 122-123 and 135
// — confirm against the upstream file before relying on this listing.
// Device-side constructor: allocates (or aliases, for reference creates) the
// direct and inverse clover buffers, derives the even/odd parity pointers,
// and aliases direct<->inverse when only one of the two is requested.
86 
88  errorQuda("Create type %d not supported", create);
89 
// Direct (un-inverted) term: allocate from the device pool, or alias the
// caller-provided pointers for reference fields.
90  if (param.direct) {
92  clover = bytes ? pool_device_malloc(bytes) : nullptr;
95  } else {
96  clover = param.clover;
97  norm = param.norm;
98  }
99 
// Even parity occupies the first half of the buffer, odd the second half.
100  even = clover;
101  odd = static_cast<char*>(clover) + bytes/2;
102 
103  evenNorm = norm;
104  oddNorm = static_cast<char*>(norm) + norm_bytes/2;
105 
107 
108  // this is a hack to prevent us allocating a texture object for an unallocated inverse field
109  if (!param.inverse) {
110  cloverInv = clover;
111  evenInv = even;
112  oddInv = odd;
113  invNorm = norm;
114  evenInvNorm = evenNorm;
115  oddInvNorm = oddNorm;
116  }
117  }
118 
// Inverse term: same allocation/aliasing scheme as the direct term above.
119  if (param.inverse) {
121  cloverInv = bytes ? pool_device_malloc(bytes) : nullptr;
124  } else {
125  cloverInv = param.cloverInv;
126  invNorm = param.invNorm;
127  }
128 
129  evenInv = cloverInv;
130  oddInv = static_cast<char*>(cloverInv) + bytes/2;
131 
132  evenInvNorm = invNorm;
133  oddInvNorm = static_cast<char*>(invNorm) + norm_bytes/2;
134 
136 
137  // this is a hack to ensure that we can autotune the clover
138  // operator when just using symmetric preconditioning
139  if (!param.direct) {
140  clover = cloverInv;
141  even = evenInv;
142  odd = oddInv;
143  norm = invNorm;
144  evenNorm = evenInvNorm;
145  oddNorm = oddInvNorm;
146  }
147  }
148 
// If no inverse was requested, alias the inverse pointers to the direct ones
// so downstream code can always dereference them.
149  if (!param.inverse) {
150  cloverInv = clover;
151  evenInv = even;
152  oddInv = odd;
153  invNorm = norm;
154  evenInvNorm = evenNorm;
155  oddInvNorm = oddNorm;
156  }
157 
// Unlike the base class, the device field honors the twisted-mass params.
158  twisted = param.twisted;
159  mu2 = param.mu2;
160  }
161 
// NOTE(review): extraction dropped the destructor signature (source line 162,
// presumably cudaCloverField::~cudaCloverField()) plus lines 164, 166 and
// 169-170 (likely the create-type guard, the clover free, and the
// cloverInv/invNorm frees) — confirm upstream.
// Releases pool-allocated device buffers; the clover != cloverInv test
// avoids double-freeing when direct and inverse alias each other.
163  {
165  if (clover != cloverInv) {
167  if (norm) pool_device_free(norm);
168  }
171  }
172  }
173 
// Copies the contents of a generic CloverField into this device field.
// Three paths: device->device (direct reorder), host source reordered on the
// CPU (pack into pinned memory, then H2D copy), or host source reordered on
// the GPU (H2D copy of raw data, then reorder kernel).
// NOTE(review): extraction dropped source lines 190, 197, 210 and 218
// (presumably `if (precision == QUDA_HALF_PRECISION ...)` guards around the
// norm copies) and line 229 (possibly a qudaDeviceSynchronize) — confirm
// upstream.
174  void cudaCloverField::copy(const CloverField &src, bool inverse) {
175 
// Abort early if the two fields' geometries are incompatible.
176  checkField(src);
177 
178  if (typeid(src) == typeid(cudaCloverField)) {
// Device-resident source: reorder directly on the GPU, direct then inverse.
179  if (src.V(false)) copyGenericClover(*this, src, false, QUDA_CUDA_FIELD_LOCATION);
180  if (src.V(true)) copyGenericClover(*this, src, true, QUDA_CUDA_FIELD_LOCATION);
181  } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION && typeid(src) == typeid(cpuCloverField)) {
// Host source, CPU reordering: pack into a pinned staging buffer sized for
// this field's layout; the norm segment follows the body for fixed-point
// precisions.
182  void *packClover = pool_pinned_malloc(bytes + norm_bytes);
183  void *packCloverNorm = (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) ?
184  static_cast<char *>(packClover) + bytes :
185  0;
186 
187  if (src.V(false)) {
188  copyGenericClover(*this, src, false, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
189  qudaMemcpy(clover, packClover, bytes, cudaMemcpyHostToDevice);
191  qudaMemcpy(norm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
192  }
193 
// The inverse term is only transferred when the caller asked for it.
194  if (src.V(true) && inverse) {
195  copyGenericClover(*this, src, true, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
196  qudaMemcpy(cloverInv, packClover, bytes, cudaMemcpyHostToDevice);
198  qudaMemcpy(invNorm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
199  }
200 
201  pool_pinned_free(packClover);
202  } else if (reorder_location() == QUDA_CUDA_FIELD_LOCATION && typeid(src) == typeid(cpuCloverField)) {
// Host source, GPU reordering: stage the raw host data on the device (sized
// for the SOURCE layout this time), then reorder with a kernel.
203  void *packClover = pool_device_malloc(src.Bytes() + src.NormBytes());
204  void *packCloverNorm = (precision == QUDA_HALF_PRECISION || precision == QUDA_QUARTER_PRECISION) ?
205  static_cast<char *>(packClover) + src.Bytes() :
206  0;
207 
208  if (src.V(false)) {
209  qudaMemcpy(packClover, src.V(false), src.Bytes(), cudaMemcpyHostToDevice);
211  qudaMemcpy(packCloverNorm, src.Norm(false), src.NormBytes(), cudaMemcpyHostToDevice);
212 
213  copyGenericClover(*this, src, false, QUDA_CUDA_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
214  }
215 
216  if (src.V(true) && inverse) {
217  qudaMemcpy(packClover, src.V(true), src.Bytes(), cudaMemcpyHostToDevice);
219  qudaMemcpy(packCloverNorm, src.Norm(true), src.NormBytes(), cudaMemcpyHostToDevice);
220 
221  copyGenericClover(*this, src, true, QUDA_CUDA_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
222  }
223 
224  pool_device_free(packClover);
225  } else {
226  errorQuda("Invalid clover field type");
227  }
228 
230  }
231 
// NOTE(review): extraction dropped the signature (source line 234, presumably
// void cudaCloverField::saveCPUField(cpuCloverField &cpu) const) and lines
// 245, 255 and 264 (likely half-precision guards around the norm copies and
// a trailing sync) — confirm upstream.
// Copies this device field back into a host (cpuCloverField) field: D2H copy
// of the raw data into a pinned staging buffer, then reorder on the CPU.
// Aborts if direct/inverse presence differs between the two fields.
233 
235  checkField(cpu);
236 
237  // we know we are copying from GPU to CPU here, so for now just
238  // assume that reordering is on CPU
239  void *packClover = pool_pinned_malloc(bytes + norm_bytes);
240  void *packCloverNorm = (precision == QUDA_HALF_PRECISION) ? static_cast<char*>(packClover) + bytes : 0;
241 
242  // first copy over the direct part (if it exists)
243  if (V(false) && cpu.V(false)) {
244  qudaMemcpy(packClover, clover, bytes, cudaMemcpyDeviceToHost);
246  qudaMemcpy(packCloverNorm, norm, norm_bytes, cudaMemcpyDeviceToHost);
247  copyGenericClover(cpu, *this, false, QUDA_CPU_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
248  } else if((V(false) && !cpu.V(false)) || (!V(false) && cpu.V(false))) {
249  errorQuda("Mismatch between Clover field GPU V(false) and CPU.V(false)");
250  }
251 
252  // now copy the inverse part (if it exists)
253  if (V(true) && cpu.V(true)) {
254  qudaMemcpy(packClover, cloverInv, bytes, cudaMemcpyDeviceToHost);
256  qudaMemcpy(packCloverNorm, invNorm, norm_bytes, cudaMemcpyDeviceToHost);
257  copyGenericClover(cpu, *this, true, QUDA_CPU_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
258  } else if ((V(true) && !cpu.V(true)) || (!V(true) && cpu.V(true))) {
259  errorQuda("Mismatch between Clover field GPU V(true) and CPU.V(true)");
260  }
261 
262  pool_pinned_free(packClover);
263 
265  }
266 
// Serializes this device field into a caller-provided host buffer: direct
// data (plus norm) first, then the inverse data (plus norm) at buffer_offset.
// NOTE(review): extraction dropped source lines 273 and 281 (likely
// `if (norm_bytes > 0) {` guards) — the brace structure below is unbalanced
// as a result (the `}` at source line 277 closes a dropped opening). Confirm
// upstream before editing.
267  void cudaCloverField::copy_to_buffer(void *buffer) const
268  {
269 
270  size_t buffer_offset = 0;
271  if (V(false)) { // direct
272  qudaMemcpy(buffer, clover, bytes, cudaMemcpyDeviceToHost);
274  qudaMemcpy(static_cast<char *>(buffer) + bytes, norm, norm_bytes, cudaMemcpyDeviceToHost);
275  }
// Advance past the direct segment so the inverse lands after it.
276  buffer_offset += bytes + norm_bytes;
277  }
278 
279  if (V(true)) { // inverse
280  qudaMemcpy(static_cast<char *>(buffer) + buffer_offset, cloverInv, bytes, cudaMemcpyDeviceToHost);
282  qudaMemcpy(static_cast<char *>(buffer) + buffer_offset + bytes, invNorm, norm_bytes, cudaMemcpyDeviceToHost);
283  }
284  }
285  }
286 
// NOTE(review): extraction dropped the signature (source line 287, presumably
// void cudaCloverField::copy_from_buffer(void *buffer)) and lines 293 and
// 301 (likely `if (norm_bytes > 0) {` guards — the brace structure below is
// unbalanced as a result). Confirm upstream.
// Mirror of copy_to_buffer: deserializes direct then inverse data from a
// host buffer into this device field via H2D copies.
288  {
289 
290  size_t buffer_offset = 0;
291  if (V(false)) { // direct
292  qudaMemcpy(clover, static_cast<char *>(buffer), bytes, cudaMemcpyHostToDevice);
294  qudaMemcpy(norm, static_cast<char *>(buffer) + bytes, norm_bytes, cudaMemcpyHostToDevice);
295  }
// Skip past the direct segment to reach the inverse segment.
296  buffer_offset += bytes + norm_bytes;
297  }
298 
299  if (V(true)) { // inverse
300  qudaMemcpy(cloverInv, static_cast<char *>(buffer) + buffer_offset, bytes, cudaMemcpyHostToDevice);
302  qudaMemcpy(invNorm, static_cast<char *>(buffer) + buffer_offset + bytes, norm_bytes, cudaMemcpyHostToDevice);
303  }
304  }
305  }
306 
// NOTE(review): unidentifiable fragment — the extraction dropped the
// signature (source line 307) and body (source line 309) of this member
// entirely. Recover it from the upstream file; do not guess its purpose.
308  {
310  }
311 
313  QudaParity parity) const
314  {
315  if (is_prefetch_enabled()) {
316  auto clover_parity = clover;
317  auto norm_parity = norm;
318  auto cloverInv_parity = cloverInv;
319  auto invNorm_parity = invNorm;
320  auto bytes_parity = bytes;
321  auto norm_bytes_parity = norm_bytes;
322  if (parity != QUDA_INVALID_PARITY) {
323  bytes_parity /= 2;
324  norm_bytes_parity /= 2;
325  if (parity == QUDA_EVEN_PARITY) {
326  clover_parity = even;
327  norm_parity = evenNorm;
328  cloverInv_parity = evenInv;
329  invNorm_parity = evenInvNorm;
330  } else { // odd
331  clover_parity = odd;
332  norm_parity = oddNorm;
333  cloverInv_parity = oddInv;
334  invNorm_parity = oddInvNorm;
335  }
336  }
337 
338  switch (type) {
340  if (clover_parity) qudaMemPrefetchAsync(clover_parity, bytes_parity, mem_space, stream);
341  if (norm_parity) qudaMemPrefetchAsync(norm_parity, norm_bytes_parity, mem_space, stream);
342  if (clover_parity != cloverInv_parity) {
343  if (cloverInv_parity) qudaMemPrefetchAsync(cloverInv_parity, bytes_parity, mem_space, stream);
344  if (invNorm_parity) qudaMemPrefetchAsync(invNorm_parity, norm_bytes_parity, mem_space, stream);
345  }
346  break;
348  if (clover_parity) qudaMemPrefetchAsync(clover_parity, bytes_parity, mem_space, stream);
349  if (norm_parity) qudaMemPrefetchAsync(norm_parity, norm_bytes_parity, mem_space, stream);
350  break;
352  if (cloverInv_parity) qudaMemPrefetchAsync(cloverInv_parity, bytes_parity, mem_space, stream);
353  if (invNorm_parity) qudaMemPrefetchAsync(invNorm_parity, norm_bytes_parity, mem_space, stream);
354  break;
355  default: errorQuda("Invalid CloverPrefetchType.");
356  }
357  }
358  }
359 
363  void cudaCloverField::compute(const cudaGaugeField &gauge) { computeClover(*this, gauge, 1.0); }
364 
// NOTE(review): extraction dropped the constructor signature (source line
// 365, presumably cpuCloverField::cpuCloverField(const CloverFieldParam
// &param) delegating to CloverField(param)), the opening create-type branch
// at line 367, and the norm-allocation lines 370, 373, 376 and 379 —
// confirm upstream.
// Host-side constructor: allocates (and optionally zero-fills) packed-order
// host buffers, or aliases caller-provided pointers for reference fields.
366 
// Host fields only implement the packed ordering.
368  if(order != QUDA_PACKED_CLOVER_ORDER) {errorQuda("cpuCloverField only supports QUDA_PACKED_CLOVER_ORDER");}
369  clover = (void *) safe_malloc(bytes);
// The inverse buffer is only allocated on request.
371  if(param.inverse) {
372  cloverInv = (void *) safe_malloc(bytes);
374  }
375 
// (Presumably inside a QUDA_ZERO_FIELD_CREATE branch — its opening line was
// dropped by the extraction.) Zero-initialize whatever was allocated.
377  memset(clover, '\0', bytes);
378  if(param.inverse) memset(cloverInv, '\0', bytes);
380  if(param.inverse && precision ==QUDA_HALF_PRECISION) memset(invNorm, '\0', norm_bytes);
381  }
382  } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
// Reference create: alias the caller's buffers; no ownership taken.
383  clover = param.clover;
384  norm = param.norm;
385  cloverInv = param.cloverInv;
386  invNorm = param.invNorm;
387  } else {
388  errorQuda("Create type %d not supported", create);
389  }
390 
// Host fields do not support padding.
391  if (param.pad != 0) errorQuda("%s pad must be zero", __func__);
392  }
393 
// NOTE(review): extraction dropped the destructor signature (source line 394,
// presumably cpuCloverField::~cpuCloverField()) and lines 396 and 399
// (likely the create-type ownership guard and the cloverInv free) — confirm
// upstream.
// Frees host buffers allocated by the constructor; null pointers are skipped.
395  {
397  if (clover) host_free(clover);
398  if (norm) host_free(norm);
400  if (invNorm) host_free(invNorm);
401  }
402  }
403 
// Serializes this host field into a caller-provided buffer: direct data
// (plus norm for sub-single precisions) first, then the inverse segment at
// buffer_offset.
// NOTE(review): extraction dropped source line 416 (likely the
// `if (precision < QUDA_SINGLE_PRECISION) {` guard for the inverse norm;
// the brace structure below is unbalanced as a result) — confirm upstream.
404  void cpuCloverField::copy_to_buffer(void *buffer) const
405  {
406 
407  size_t buffer_offset = 0;
408  if (V(false)) { // direct
409  std::memcpy(static_cast<char *>(buffer), clover, bytes);
// The norm array only exists for fixed-point (below-single) precisions.
410  if (precision < QUDA_SINGLE_PRECISION) { std::memcpy(static_cast<char *>(buffer) + bytes, norm, norm_bytes); }
411  buffer_offset += bytes + norm_bytes;
412  }
413 
414  if (V(true)) { // inverse
415  std::memcpy(static_cast<char *>(buffer) + buffer_offset, cloverInv, bytes);
417  std::memcpy(static_cast<char *>(buffer) + buffer_offset + bytes, invNorm, norm_bytes);
418  }
419  }
420  }
421 
// NOTE(review): extraction dropped the signature (source line 422, presumably
// void cpuCloverField::copy_from_buffer(void *buffer)) and line 434 (likely
// the `if (precision < QUDA_SINGLE_PRECISION) {` guard for the inverse norm;
// the brace structure below is unbalanced as a result) — confirm upstream.
// Mirror of cpuCloverField::copy_to_buffer: deserializes direct then inverse
// data from a host buffer into this field with memcpy.
423  {
424 
425  size_t buffer_offset = 0;
426  if (V(false)) { // direct
427  std::memcpy(clover, static_cast<char *>(buffer), bytes);
// The norm array only exists for fixed-point (below-single) precisions.
428  if (precision < QUDA_SINGLE_PRECISION) { std::memcpy(norm, static_cast<char *>(buffer) + bytes, norm_bytes); }
429  buffer_offset += bytes + norm_bytes;
430  }
431 
432  if (V(true)) { // inverse
433  std::memcpy(cloverInv, static_cast<char *>(buffer) + buffer_offset, bytes);
435  std::memcpy(invNorm, static_cast<char *>(buffer) + buffer_offset + bytes, norm_bytes);
436  }
437  }
438  }
439 
440  // This doesn't really live here, but is fine for the moment
441  std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param)
442  {
443  output << static_cast<const LatticeFieldParam&>(param);
444  output << "direct = " << param.direct << std::endl;
445  output << "inverse = " << param.inverse << std::endl;
446  output << "clover = " << param.clover << std::endl;
447  output << "norm = " << param.norm << std::endl;
448  output << "cloverInv = " << param.cloverInv << std::endl;
449  output << "invNorm = " << param.invNorm << std::endl;
450  output << "csw = " << param.csw << std::endl;
451  output << "coeff = " << param.coeff << std::endl;
452  output << "twisted = " << param.twisted << std::endl;
453  output << "mu2 = " << param.mu2 << std::endl;
454  output << "rho = " << param.rho << std::endl;
455  output << "order = " << param.order << std::endl;
456  output << "create = " << param.create << std::endl;
457  return output; // for multiple << operators.
458  }
459 
// NOTE(review): extraction dropped the signature (source line 460 — per the
// index this is ColorSpinorParam colorSpinorParam(const CloverField &a,
// bool inverse)) and the second half of the fieldOrder ternary at source
// line 476 (per the index, presumably QUDA_FLOAT2_FIELD_ORDER :
// QUDA_FLOAT4_FIELD_ORDER) — confirm upstream.
// Builds a ColorSpinorParam that views the clover field's storage as a
// 9-color, 4-spin full-lattice spinor (72 reals per site), so blas routines
// can operate on it; half precision is rejected up front.
461 
462  if (a.Precision() == QUDA_HALF_PRECISION)
463  errorQuda("Casting a CloverField into ColorSpinorField not possible in half precision");
464 
465  ColorSpinorParam spinor_param;
466  // 72 = 9 * 4 * 2
467  spinor_param.nColor = 9;
468  spinor_param.nSpin = 4;
469  spinor_param.nDim = a.Ndim();
470  for (int d=0; d<a.Ndim(); d++) spinor_param.x[d] = a.X()[d];
471  spinor_param.setPrecision(a.Precision());
472  spinor_param.pad = a.Pad();
473  spinor_param.siteSubset = QUDA_FULL_SITE_SUBSET;
474  spinor_param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
475  spinor_param.fieldOrder = a.Precision() == QUDA_DOUBLE_PRECISION ?
477  spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
// Reference create: the spinor aliases the clover data, no copy is made.
478  spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
479  spinor_param.v = (void*)a.V(inverse);
480  spinor_param.location = a.Location();
481  return spinor_param;
482  }
483 
484  // Return the L2 norm squared of the clover field
// NOTE(review): extraction dropped source line 486 (presumably
// `ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse));`
// — `b` is otherwise undeclared). Wraps the field as a spinor, takes
// blas::norm2, and frees the temporary view. Confirm upstream.
485  double norm2(const CloverField &a, bool inverse) {
487  double nrm2 = blas::norm2(*b);
488  delete b;
489  return nrm2;
490  }
491 
492  // Return the L1 norm of the clover field
// NOTE(review): extraction dropped source line 494 (presumably
// `ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse));`
// — `b` is otherwise undeclared). Wraps the field as a spinor, takes
// blas::norm1, and frees the temporary view. Confirm upstream.
493  double norm1(const CloverField &a, bool inverse) {
495  double nrm1 = blas::norm1(*b);
496  delete b;
497  return nrm1;
498  }
499 
500 } // namespace quda
void * Norm(bool inverse=false)
Definition: clover_field.h:139
QudaCloverFieldOrder order
Definition: clover_field.h:127
static CloverField * Create(const CloverFieldParam &param)
size_t NormBytes() const
Definition: clover_field.h:172
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch),...
void * V(bool inverse=false)
Definition: clover_field.h:138
bool isNative() const
Definition: clover_field.h:152
QudaFieldCreate create
Definition: clover_field.h:128
CloverField(const CloverFieldParam &param)
virtual ~CloverField()
size_t Bytes() const
Definition: clover_field.h:167
static ColorSpinorField * Create(const ColorSpinorParam &param)
QudaFieldLocation location
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
QudaPrecision Precision() const
QudaFieldLocation Location() const
QudaPrecision precision
const int * X() const
void checkField(const LatticeField &a) const
virtual void copy_to_buffer(void *buffer) const
Copy all contents of the field to a host buffer.
cpuCloverField(const CloverFieldParam &param)
virtual void copy_from_buffer(void *buffer)
Copy all contents of the field from a host buffer to this field.
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
void loadCPUField(const cpuCloverField &cpu)
cudaCloverField(const CloverFieldParam &param)
virtual void copy_from_buffer(void *buffer)
Copy all contents of the field from a host buffer to this field.
void saveCPUField(cpuCloverField &cpu) const
virtual void copy_to_buffer(void *buffer) const
Copy all contents of the field to a host buffer.
void prefetch(QudaFieldLocation mem_space, qudaStream_t stream=0) const
If managed memory and prefetch is enabled, prefetch the clover, the norm field (as appropriate),...
int V
Definition: host_utils.cpp:37
void * memset(void *s, int c, size_t n)
QudaParity parity
Definition: covdev_test.cpp:40
const int nColor
Definition: covdev_test.cpp:44
@ QUDA_QDPJIT_CLOVER_ORDER
Definition: enum_quda.h:259
@ QUDA_PACKED_CLOVER_ORDER
Definition: enum_quda.h:258
@ QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:326
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
@ QUDA_FULL_SITE_SUBSET
Definition: enum_quda.h:333
@ QUDA_UKQCD_GAMMA_BASIS
Definition: enum_quda.h:369
@ QUDA_EVEN_PARITY
Definition: enum_quda.h:284
@ QUDA_INVALID_PARITY
Definition: enum_quda.h:284
enum QudaFieldLocation_s QudaFieldLocation
@ QUDA_EVEN_ODD_SITE_ORDER
Definition: enum_quda.h:340
@ QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:65
@ QUDA_SINGLE_PRECISION
Definition: enum_quda.h:64
@ QUDA_QUARTER_PRECISION
Definition: enum_quda.h:62
@ QUDA_HALF_PRECISION
Definition: enum_quda.h:63
@ QUDA_FLOAT2_FIELD_ORDER
Definition: enum_quda.h:348
@ QUDA_FLOAT4_FIELD_ORDER
Definition: enum_quda.h:349
@ QUDA_ZERO_FIELD_CREATE
Definition: enum_quda.h:361
@ QUDA_REFERENCE_FIELD_CREATE
Definition: enum_quda.h:363
@ QUDA_NULL_FIELD_CREATE
Definition: enum_quda.h:360
enum QudaParity_s QudaParity
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:172
#define pool_device_malloc(size)
Definition: malloc_quda.h:170
#define safe_malloc(size)
Definition: malloc_quda.h:106
#define pool_pinned_free(ptr)
Definition: malloc_quda.h:173
#define pool_device_free(ptr)
Definition: malloc_quda.h:171
#define host_free(ptr)
Definition: malloc_quda.h:115
double norm1(const ColorSpinorField &b)
unsigned long long bytes
double norm2(const ColorSpinorField &a)
ColorSpinorParam colorSpinorParam(const CloverField &a, bool inverse)
double norm2(const CloverField &a, bool inverse=false)
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:605
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)
This generic function is used for copying the clover field where in the input and output can be in an...
qudaStream_t * stream
double norm1(const CloverField &u, bool inverse=false)
void computeClover(CloverField &clover, const GaugeField &fmunu, double coeff)
Driver for computing the clover field from the field strength tensor.
CloverPrefetchType
Definition: clover_field.h:28
QudaFieldLocation reorder_location()
Return whether data is reordered on the CPU or GPU. This can set at QUDA initialization using the env...
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
bool is_prefetch_enabled()
Definition: malloc.cpp:198
std::ostream & operator<<(std::ostream &output, const CloverFieldParam &param)
QudaGaugeParam param
Definition: pack_test.cpp:18
#define qudaMemPrefetchAsync(ptr, count, mem_space, stream)
Definition: quda_api.h:231
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_api.h:204
cudaStream_t qudaStream_t
Definition: quda_api.h:9
#define qudaDeviceSynchronize()
Definition: quda_api.h:250
#define ALIGNMENT_ADJUST(n)
Definition: quda_internal.h:42
QudaFieldLocation location
Definition: quda.h:33
QudaPrecision precision
Definition: lattice_field.h:52
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:68
QudaSiteSubset siteSubset
Definition: lattice_field.h:72
#define errorQuda(...)
Definition: util_quda.h:120