QUDA  0.9.0
clover_field.cpp
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5 #include <typeinfo>
6 
7 #include <quda_internal.h>
8 #include <clover_field.h>
9 #include <gauge_field.h>
10 #include <color_spinor_field.h>
11 #include <blas_quda.h>
12 
13 namespace quda {
14 
 // NOTE(review): tail of a copy-style constructor — presumably
 // CloverFieldParam::CloverFieldParam(const CloverField &a) — whose signature
 // (listing lines 15-16) and listing lines 28 and 33 were lost in extraction;
 // confirm the missing lines against the upstream file.
 // Pointers start empty; scalar parameters are copied from the source field.
 17  direct(false),
 18  inverse(false),
 19  clover(NULL),
 20  norm(NULL),
 21  cloverInv(NULL),
 22  invNorm(NULL),
 23  csw(a.Csw()),
 24  twisted(a.Twisted()),
 25  mu2(a.Mu2()),
 26  rho(a.Rho()),
 27  order(a.Order()),
 29  {
 // Copy the lattice-level metadata from the source field.
 30  precision = a.Precision();
 31  nDim = a.Ndim();
 32  pad = a.Pad();
 // Copy the local lattice dimensions.
 34  for(int dir=0; dir<nDim; ++dir) x[dir] = a.X()[dir];
 35  }
36 
 // CloverField(const CloverFieldParam &) constructor body (fragment):
 // validates dimensionality and ordering, then derives real_length/length and
 // the byte counts from the lattice geometry.
 // NOTE(review): the signature line (listing 37) was lost in extraction.
 38  LatticeField(param), bytes(0), norm_bytes(0), nColor(3), nSpin(4),
 39  clover(0), norm(0), cloverInv(0), invNorm(0), csw(param.csw), rho(param.rho),
 40  order(param.order), create(param.create), trlog{0, 0}
 41  {
 42  if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim);
 43 
 // QDPJIT layout is only valid when wrapping externally-owned memory.
 44  if (order == QUDA_QDPJIT_CLOVER_ORDER && create != QUDA_REFERENCE_FIELD_CREATE)
 45  errorQuda("QDPJIT ordered clover fields only supported for reference fields");
 46 
 // 72 reals per site (Nc^2 * Ns^2 / 2); real_length counts both parities.
 47  real_length = 2*volumeCB*nColor*nColor*nSpin*nSpin/2; // block-diagonal Hermitian (72 reals)
 48  length = 2*stride*nColor*nColor*nSpin*nSpin/2;
 49 
 50  bytes = (size_t)length*precision;
 // Native layouts pad each parity half to the alignment boundary.
 51  if (isNative()) bytes = 2*ALIGNMENT_ADJUST(bytes/2);
 // Half precision carries a separate float norm array (one per chirality).
 52  if (precision == QUDA_HALF_PRECISION) {
 53  norm_bytes = sizeof(float)*2*stride*2; // 2 chirality
 54  if (isNative()) norm_bytes = 2*ALIGNMENT_ADJUST(norm_bytes/2);
 55  }
 56 //for twisted mass only:
 57  twisted = false;//param.twisted;
 58  mu2 = 0.0; //param.mu2;
 59  }
60 
62 
63  bool CloverField::isNative() const {
65  if (order == QUDA_FLOAT2_CLOVER_ORDER) return true;
66  } else if (precision == QUDA_SINGLE_PRECISION ||
68  if (order == QUDA_FLOAT4_CLOVER_ORDER) return true;
69  }
70  return false;
71  }
72 
73  void CloverField::setRho(double rho_)
74  {
75  rho = rho_;
76  }
77 
 // cudaCloverField constructor body (fragment): allocates device storage for
 // the direct clover term and/or its inverse, sets up even/odd parity
 // aliases, and (optionally) creates texture objects.
 // NOTE(review): listing lines 78, 80, 84, 86, 98, 106-107, 112, 114, 123,
 // 126, 135-136 and 145-146 were lost in extraction — presumably the
 // signature, the create-type guard, the half-precision norm allocations and
 // the evenInvNorm assignment; confirm against the upstream file.
 79 
 81  errorQuda("Create type %d not supported", create);
 82 
 // Allocate (param.direct) or reference (param.clover/norm) the direct term.
 83  if (param.direct) {
 85  clover = bytes ? pool_device_malloc(bytes) : nullptr;
 87  } else {
 88  clover = param.clover;
 89  norm = param.norm;
 90  }
 91 
 // Even/odd parities alias the two halves of the contiguous allocation.
 92  even = clover;
 93  odd = static_cast<char*>(clover) + bytes/2;
 94 
 95  evenNorm = norm;
 96  oddNorm = static_cast<char*>(norm) + norm_bytes/2;
 97 
 99 
 100  // this is a hack to prevent us allocating a texture object for an unallocated inverse field
 101  if (!param.inverse) {
 102  cloverInv = clover;
 103  evenInv = even;
 104  oddInv = odd;
 105  invNorm = norm;
 108  }
 109  }
 110 
 // Allocate (param.inverse) or reference the inverse term analogously.
 111  if (param.inverse) {
 113  cloverInv = bytes ? pool_device_malloc(bytes) : nullptr;
 115  } else {
 116  cloverInv = param.cloverInv;
 117  invNorm = param.invNorm;
 118  }
 119 
 120  evenInv = cloverInv;
 121  oddInv = static_cast<char*>(cloverInv) + bytes/2;
 122 
 124  oddInvNorm = static_cast<char*>(invNorm) + norm_bytes/2;
 125 
 127 
 128  // this is a hack to ensure that we can autotune the clover
 129  // operator when just using symmetric preconditioning
 130  if (!param.direct) {
 131  clover = cloverInv;
 132  even = evenInv;
 133  odd = oddInv;
 134  norm = invNorm;
 137  }
 138  }
 139 
 // Mirror direct-term pointers onto the inverse aliases when no inverse
 // field was requested (keeps texture creation below valid).
 140  if (!param.inverse) {
 141  cloverInv = clover;
 142  evenInv = even;
 143  oddInv = odd;
 144  invNorm = norm;
 147  }
 148 
 // Bind texture objects for the full-field, parity and inverse views.
 149 #ifdef USE_TEXTURE_OBJECTS
 150  createTexObject(tex, normTex, clover, norm, true);
 151  createTexObject(invTex, invNormTex, cloverInv, invNorm, true);
 152 
 153  createTexObject(evenTex, evenNormTex, even, evenNorm, false);
 154  createTexObject(oddTex, oddNormTex, odd, oddNorm, false);
 155 
 156  createTexObject(evenInvTex, evenInvNormTex, evenInv, evenInvNorm, false);
 157  createTexObject(oddInvTex, oddInvNormTex, oddInv, oddInvNorm, false);
 158 #endif
 159  twisted = param.twisted;
 160  mu2 = param.mu2;
 161 
 162  }
163 
164 #ifdef USE_TEXTURE_OBJECTS
 // Create 1-D linear texture objects over the clover data (`tex`) and its
 // float norm array (`texNorm`).  `full` selects whether the texture covers
 // the whole field or a single parity (half the bytes).  Only applies to
 // native-layout fields.
 // NOTE(review): listing line 203 was lost in extraction; the re-declarations
 // of desc/resDesc/texDesc below imply it opened a new scope — presumably
 // `if (precision == QUDA_HALF_PRECISION) {` guarding the norm texture, which
 // the extra closing brace at listing 222 matches.  Confirm upstream.
 165  void cudaCloverField::createTexObject(cudaTextureObject_t &tex, cudaTextureObject_t &texNorm,
 166  void *field, void *norm, bool full) {
 167  if (isNative()) {
 168  // create the texture for the field components
 169 
 170  cudaChannelFormatDesc desc;
 171  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
 172  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
 173  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
 174 
 175  // always four components regardless of precision
 176  desc.x = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
 177  desc.y = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
 178  desc.z = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
 179  desc.w = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
 180  int texel_size = 4 * (precision == QUDA_DOUBLE_PRECISION ? sizeof(int) : precision);
 181 
 182  cudaResourceDesc resDesc;
 183  memset(&resDesc, 0, sizeof(resDesc));
 184  resDesc.resType = cudaResourceTypeLinear;
 185  resDesc.res.linear.devPtr = field;
 186  resDesc.res.linear.desc = desc;
 187  resDesc.res.linear.sizeInBytes = bytes/(!full ? 2 : 1);
 188 
 // Guard against exceeding the device's 1-D linear texture limit.
 189  unsigned long texels = resDesc.res.linear.sizeInBytes / texel_size;
 190  if (texels > (unsigned)deviceProp.maxTexture1DLinear) {
 191  errorQuda("Attempting to bind too large a texture %lu > %d", texels, deviceProp.maxTexture1DLinear);
 192  }
 193 
 194  cudaTextureDesc texDesc;
 195  memset(&texDesc, 0, sizeof(texDesc));
 // Half precision is read back as normalized float; otherwise read raw.
 196  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
 197  else texDesc.readMode = cudaReadModeElementType;
 198 
 199  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
 200  checkCudaError();
 201 
 202  // create the texture for the norm components
 204  cudaChannelFormatDesc desc;
 205  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
 // The norm array is always single-precision float, single component.
 206  desc.f = cudaChannelFormatKindFloat;
 207  desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;
 208 
 209  cudaResourceDesc resDesc;
 210  memset(&resDesc, 0, sizeof(resDesc));
 211  resDesc.resType = cudaResourceTypeLinear;
 212  resDesc.res.linear.devPtr = norm;
 213  resDesc.res.linear.desc = desc;
 214  resDesc.res.linear.sizeInBytes = norm_bytes/(!full ? 2 : 1);
 215 
 216  cudaTextureDesc texDesc;
 217  memset(&texDesc, 0, sizeof(texDesc));
 218  texDesc.readMode = cudaReadModeElementType;
 219 
 220  cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
 221  checkCudaError();
 222  }
 223  }
 224 
 225  }
226 
 // Destroy all texture objects created by the constructor.
 // NOTE(review): listing line 235 was lost in extraction; the extra closing
 // brace at listing 242 implies it was a guard — presumably
 // `if (precision == QUDA_HALF_PRECISION) {` around the norm-texture
 // destruction.  Confirm upstream.
 227  void cudaCloverField::destroyTexObject() {
 228  if (isNative()) {
 229  cudaDestroyTextureObject(tex);
 230  cudaDestroyTextureObject(invTex);
 231  cudaDestroyTextureObject(evenTex);
 232  cudaDestroyTextureObject(oddTex);
 233  cudaDestroyTextureObject(evenInvTex);
 234  cudaDestroyTextureObject(oddInvTex);
 236  cudaDestroyTextureObject(normTex);
 237  cudaDestroyTextureObject(invNormTex);
 238  cudaDestroyTextureObject(evenNormTex);
 239  cudaDestroyTextureObject(oddNormTex);
 240  cudaDestroyTextureObject(evenInvNormTex);
 241  cudaDestroyTextureObject(oddInvNormTex);
 242  }
 243  checkCudaError();
 244  }
 245  }
246 #endif
247 
 // ~cudaCloverField (fragment): tear down texture objects and free
 // pool-allocated device memory.
 // NOTE(review): listing lines 248, 254, 256 and 259-260 were lost in
 // extraction — presumably the signature, a guard that skips frees for
 // reference-created fields, and the clover/cloverInv/invNorm frees.
 // Confirm against the upstream file.
 249  {
 250 #ifdef USE_TEXTURE_OBJECTS
 251  destroyTexObject();
 252 #endif
 253 
 // Only free direct-term buffers when distinct from the inverse buffers
 // (the constructor aliases them when only one of direct/inverse exists).
 255  if (clover != cloverInv) {
 257  if (norm) pool_device_free(norm);
 258  }
 261  }
 262 
 263  checkCudaError();
 264  }
265 
 // Copy `src` into this device field, reordering between layouts/precisions
 // as needed.  `inverse` controls whether the inverse term is also copied
 // from host sources.
 // NOTE(review): listing lines 280, 287, 298 and 306 were lost in extraction
 // — presumably `if (precision == QUDA_HALF_PRECISION)` guards before the
 // norm-array memcpys.  Confirm against the upstream file.
 266  void cudaCloverField::copy(const CloverField &src, bool inverse) {
 267 
 268  checkField(src);
 269 
 // GPU source: reorder entirely on the device.
 270  if (typeid(src) == typeid(cudaCloverField)) {
 271  if (src.V(false)) copyGenericClover(*this, src, false, QUDA_CUDA_FIELD_LOCATION);
 272  if (src.V(true)) copyGenericClover(*this, src, true, QUDA_CUDA_FIELD_LOCATION);
 // CPU source, reordering on the host: pack into a pinned staging buffer
 // (norm appended for half precision), then copy to the device.
 273  } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION && typeid(src) == typeid(cpuCloverField)) {
 274  void *packClover = pool_pinned_malloc(bytes + norm_bytes);
 275  void *packCloverNorm = (precision == QUDA_HALF_PRECISION) ? static_cast<char*>(packClover) + bytes : 0;
 276 
 277  if (src.V(false)) {
 278  copyGenericClover(*this, src, false, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
 279  qudaMemcpy(clover, packClover, bytes, cudaMemcpyHostToDevice);
 281  qudaMemcpy(norm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
 282  }
 283 
 284  if (src.V(true) && inverse) {
 285  copyGenericClover(*this, src, true, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
 286  qudaMemcpy(cloverInv, packClover, bytes, cudaMemcpyHostToDevice);
 288  qudaMemcpy(invNorm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
 289  }
 290 
 291  pool_pinned_free(packClover);
 // CPU source, reordering on the device: copy the raw host data over first,
 // then reorder with a GPU kernel.
 292  } else if (reorder_location() == QUDA_CUDA_FIELD_LOCATION && typeid(src) == typeid(cpuCloverField)) {
 293  void *packClover = pool_device_malloc(src.Bytes() + src.NormBytes());
 294  void *packCloverNorm = (precision == QUDA_HALF_PRECISION) ? static_cast<char*>(packClover) + src.Bytes() : 0;
 295 
 296  if (src.V(false)) {
 297  qudaMemcpy(packClover, src.V(false), src.Bytes(), cudaMemcpyHostToDevice);
 299  qudaMemcpy(packCloverNorm, src.Norm(false), src.NormBytes(), cudaMemcpyHostToDevice);
 300 
 301  copyGenericClover(*this, src, false, QUDA_CUDA_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
 302  }
 303 
 304  if (src.V(true) && inverse) {
 305  qudaMemcpy(packClover, src.V(true), src.Bytes(), cudaMemcpyHostToDevice);
 307  qudaMemcpy(packCloverNorm, src.Norm(true), src.NormBytes(), cudaMemcpyHostToDevice);
 308 
 309  copyGenericClover(*this, src, true, QUDA_CUDA_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
 310  }
 311 
 312  pool_device_free(packClover);
 313  } else {
 314  errorQuda("Invalid clover field type");
 315  }
 316 
 317  checkCudaError();
 318  }
319 
 321 
 // Save this device field into `cpu` (GPU -> CPU), staging through a pinned
 // host buffer and reordering on the host.
 // NOTE(review): listing lines 320, 322, 333 and 343 were lost in extraction
 // — per the cross references the signature is
 //   void cudaCloverField::saveCPUField(cpuCloverField &cpu) const
 // and the latter two are presumably half-precision guards before the norm
 // copies.  Confirm against the upstream file.
 323  checkField(cpu);
 324 
 325  // we know we are copying from GPU to CPU here, so for now just
 326  // assume that reordering is on CPU
 327  void *packClover = pool_pinned_malloc(bytes + norm_bytes);
 328  void *packCloverNorm = (precision == QUDA_HALF_PRECISION) ? static_cast<char*>(packClover) + bytes : 0;
 329 
 330  // first copy over the direct part (if it exists)
 331  if (V(false) && cpu.V(false)) {
 332  qudaMemcpy(packClover, clover, bytes, cudaMemcpyDeviceToHost);
 334  qudaMemcpy(packCloverNorm, norm, norm_bytes, cudaMemcpyDeviceToHost);
 335  copyGenericClover(cpu, *this, false, QUDA_CPU_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
 336  } else if((V(false) && !cpu.V(false)) || (!V(false) && cpu.V(false))) {
 337  errorQuda("Mismatch between Clover field GPU V(false) and CPU.V(false)");
 338  }
 339 
 340  // now copy the inverse part (if it exists)
 341  if (V(true) && cpu.V(true)) {
 342  qudaMemcpy(packClover, cloverInv, bytes, cudaMemcpyDeviceToHost);
 344  qudaMemcpy(packCloverNorm, invNorm, norm_bytes, cudaMemcpyDeviceToHost);
 345  copyGenericClover(cpu, *this, true, QUDA_CPU_FIELD_LOCATION, 0, packClover, 0, packCloverNorm);
 346  } else if ((V(true) && !cpu.V(true)) || (!V(true) && cpu.V(true))) {
 347  errorQuda("Mismatch between Clover field GPU V(true) and CPU.V(true)");
 348  }
 349 
 350  pool_pinned_free(packClover);
 351  }
352 
 357 
 // Compute the clover term from `gauge` on the GPU with unit coefficient.
 // NOTE(review): the signature (listing lines ~353-356) was lost in
 // extraction; per the cross references it is
 //   void cudaCloverField::compute(const cudaGaugeField &gauge)
 // Confirm against the upstream file.
 358  if (gauge.Precision() != precision)
 359  errorQuda("Gauge and clover precisions must match");
 360 
 361  computeClover(*this, gauge, 1.0, QUDA_CUDA_FIELD_LOCATION);
 362 
 363  }
364 
 366 
 // cpuCloverField constructor body (fragment): allocates (and optionally
 // zero-initializes) host storage, or aliases user-provided pointers for
 // reference-created fields.  Only the packed order is supported.
 // NOTE(review): listing lines 365, 367, 370, 373, 376 and 379 were lost in
 // extraction — presumably the signature, the create-type branch guarding the
 // allocations, the half-precision norm allocations, the zero-create branch
 // and the norm memset.  Confirm against the upstream file.
 368  if(order != QUDA_PACKED_CLOVER_ORDER) {errorQuda("cpuCloverField only supports QUDA_PACKED_CLOVER_ORDER");}
 369  clover = (void *) safe_malloc(bytes);
 371  if(param.inverse) {
 372  cloverInv = (void *) safe_malloc(bytes);
 374  }
 375 
 377  memset(clover, '\0', bytes);
 378  if(param.inverse) memset(cloverInv, '\0', bytes);
 380  if(param.inverse && precision ==QUDA_HALF_PRECISION) memset(invNorm, '\0', norm_bytes);
 381  }
 // Reference fields just alias the caller's pointers; no ownership taken.
 382  } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
 383  clover = param.clover;
 384  norm = param.norm;
 385  cloverInv = param.cloverInv;
 386  invNorm = param.invNorm;
 387  } else {
 388  errorQuda("Create type %d not supported", create);
 389  }
 390 
 // Padding is not supported for CPU clover fields.
 391  if (param.pad != 0) errorQuda("%s pad must be zero", __func__);
 392  }
393 
 // ~cpuCloverField (fragment): free owned host allocations.
 // NOTE(review): listing lines 394-395 and 398 were lost in extraction —
 // presumably the signature, a guard skipping frees for reference-created
 // fields, and the cloverInv free.  Confirm against the upstream file.
 396  if (clover) host_free(clover);
 397  if (norm) host_free(norm);
 399  if (invNorm) host_free(invNorm);
 400  }
 401  }
402 
403  // This doesn't really live here, but is fine for the moment
404  std::ostream& operator<<(std::ostream& output, const CloverFieldParam& param)
405  {
406  output << static_cast<const LatticeFieldParam&>(param);
407  output << "direct = " << param.direct << std::endl;
408  output << "inverse = " << param.inverse << std::endl;
409  output << "clover = " << param.clover << std::endl;
410  output << "norm = " << param.norm << std::endl;
411  output << "cloverInv = " << param.cloverInv << std::endl;
412  output << "invNorm = " << param.invNorm << std::endl;
413  output << "csw = " << param.csw << std::endl;
414  output << "twisted = " << param.twisted << std::endl;
415  output << "mu2 = " << param.mu2 << std::endl;
416  output << "rho = " << param.rho << std::endl;
417  output << "order = " << param.order << std::endl;
418  output << "create = " << param.create << std::endl;
419  return output; // for multiple << operators.
420  }
421 
 423 
 // Build a ColorSpinorParam that reinterprets the clover field `a` (or its
 // inverse) as a color-spinor field — 9 colors x 4 spins x complex = 72
 // reals per site — so the blas norm kernels can be reused on clover data.
 // NOTE(review): listing lines 422 and 438 were lost in extraction — per the
 // cross references the signature is
 //   ColorSpinorParam colorSpinorParam(const CloverField &a, bool inverse)
 // and line 438 is the tail of the fieldOrder ternary (presumably
 // QUDA_FLOAT2_FIELD_ORDER : QUDA_FLOAT4_FIELD_ORDER).  Confirm upstream.
 424  if (a.Precision() == QUDA_HALF_PRECISION)
 425  errorQuda("Casting a CloverField into ColorSpinorField not possible in half precision");
 426 
 427  ColorSpinorParam spinor_param;
 428  // 72 = 9 * 4 * 2
 429  spinor_param.nColor = 9;
 430  spinor_param.nSpin = 4;
 431  spinor_param.nDim = a.Ndim();
 432  for (int d=0; d<a.Ndim(); d++) spinor_param.x[d] = a.X()[d];
 433  spinor_param.precision = a.Precision();
 434  spinor_param.pad = a.Pad();
 435  spinor_param.siteSubset = QUDA_FULL_SITE_SUBSET;
 436  spinor_param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
 437  spinor_param.fieldOrder = a.Precision() == QUDA_DOUBLE_PRECISION ?
 439  spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
 // Wrap the existing clover allocation; no copy is made.
 440  spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
 441  spinor_param.v = (void*)a.V(inverse);
 442  spinor_param.location = a.Location();
 443  return spinor_param;
 444  }
445 
446  // Return the L2 norm squared of the clover field
447  double norm2(const CloverField &a, bool inverse) {
449  double nrm2 = blas::norm2(*b);
450  delete b;
451  return nrm2;
452  }
453 
454  // Return the L1 norm of the clover field
455  double norm1(const CloverField &a, bool inverse) {
457  double nrm1 = blas::norm1(*b);
458  delete b;
459  return nrm1;
460  }
461 
462 } // namespace quda
virtual ~CloverField()
void setRho(double rho)
Bakes in the rho factor into the clover field, (for real diagonal additive Hasenbusch), e.g., A + rho.
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
QudaFieldLocation reorder_location()
Return whether data is reordered on the CPU or the GPU. This can be set at QUDA initialization time via an environment variable.
cudaCloverField(const CloverFieldParam &param)
#define pool_pinned_free(ptr)
Definition: malloc_quda.h:116
cudaDeviceProp deviceProp
void * V(bool inverse=false)
Definition: clover_field.h:73
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
const void * src
#define errorQuda(...)
Definition: util_quda.h:90
double norm2(const ColorSpinorField &a)
Definition: reduce_quda.cu:241
void saveCPUField(cpuCloverField &cpu) const
#define host_free(ptr)
Definition: malloc_quda.h:59
void loadCPUField(const cpuCloverField &cpu)
static ColorSpinorField * Create(const ColorSpinorParam &param)
QudaPrecision precision
Definition: lattice_field.h:54
QudaCloverFieldOrder order
Definition: clover_field.h:64
CloverField(const CloverFieldParam &param)
QudaSiteSubset siteSubset
Definition: lattice_field.h:55
QudaFieldCreate create
Definition: clover_field.h:65
std::ostream & operator<<(std::ostream &output, const CloverFieldParam &param)
double norm2(const CloverField &a, bool inverse=false)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
int x[QUDA_MAX_DIM]
Definition: lattice_field.h:50
double norm1(const CloverField &u, bool inverse=false)
QudaFieldLocation location
void checkField(const LatticeField &a) const
const int nColor
Definition: covdev_test.cpp:77
#define ALIGNMENT_ADJUST(n)
Definition: quda_internal.h:32
void compute(const cudaGaugeField &gauge)
#define pool_device_malloc(size)
Definition: malloc_quda.h:113
long unsigned int size_t
cpuCloverField(const CloverFieldParam &param)
bool isNative() const
#define safe_malloc(size)
Definition: malloc_quda.h:54
void copy(const CloverField &src, bool inverse=true)
Copy into this CloverField from the generic CloverField src.
void * memset(void *__b, int __c, size_t __len)
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:115
double norm1(const ColorSpinorField &b)
Definition: reduce_quda.cu:200
static __inline__ dim3 dim3 void size_t cudaStream_t int enum cudaTextureReadMode readMode static __inline__ const struct texture< T, dim, readMode > & tex
ColorSpinorParam colorSpinorParam(const CloverField &a, bool inverse)
void size_t length
#define pool_device_free(ptr)
Definition: malloc_quda.h:114
#define checkCudaError()
Definition: util_quda.h:129
const struct cudaChannelFormatDesc * desc
static __inline__ size_t size_t d
QudaPrecision Precision() const
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:204
QudaPrecision precision
#define a
unsigned long long bytes
Definition: blas_quda.cu:43
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)
This generic function is used for copying the clover field, where the input and output fields can be in any supported order and precision.
Definition: copy_clover.cu:175