QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
clover_field.cpp
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <math.h>
5 
6 #include <quda_internal.h>
7 #include <clover_field.h>
8 #include <gauge_field.h>
9 
10 namespace quda {
11 
13  LatticeField(param), bytes(0), norm_bytes(0), nColor(3), nSpin(4)
14  {
15  if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim);
16 
17  real_length = 2*volumeCB*nColor*nColor*nSpin*nSpin/2; // block-diagonal Hermitian (72 reals)
19 
22  if (precision == QUDA_HALF_PRECISION) {
23  norm_bytes = sizeof(float)*2*stride*2; // 2 chirality
25  }
26 
28  }
29 
31 
32  }
33 
34  cudaCloverField::cudaCloverField(const void *h_clov, const void *h_clov_inv,
35  const QudaPrecision cpu_prec,
36  const QudaCloverFieldOrder cpu_order,
37  const CloverFieldParam &param)
38  : CloverField(param), clover(0), norm(0), cloverInv(0), invNorm(0)
39  {
40  if (h_clov) {
41  clover = device_malloc(bytes);
43  norm = device_malloc(norm_bytes);
44  }
45 
46  even = clover;
47  odd = (char*)clover + bytes/2;
48 
49  evenNorm = norm;
50  oddNorm = (char*)norm + norm_bytes/2;
51 
52  loadCPUField(clover, norm, h_clov, cpu_prec, cpu_order);
53  }
54 
55  if (h_clov_inv) {
56  cloverInv = device_malloc(bytes);
58  invNorm = device_malloc(bytes);
59  }
60 
61  evenInv = cloverInv;
62  oddInv = (char*)cloverInv + bytes/2;
63 
64  evenInvNorm = invNorm;
65  oddInvNorm = (char*)invNorm + norm_bytes/2;
66 
68 
69  loadCPUField(cloverInv, invNorm, h_clov_inv, cpu_prec, cpu_order);
70 
71  // this is a hack to ensure that we can autotune the clover
72  // operator when just using symmetric preconditioning
73  if (!clover) {
74  clover = cloverInv;
75  even = evenInv;
76  odd = oddInv;
77  }
78  if (!norm) {
79  norm = invNorm;
80  evenNorm = evenInvNorm;
81  oddNorm = oddInvNorm;
82  }
83  }
84 
85 #ifdef USE_TEXTURE_OBJECTS
86  createTexObject(evenTex, evenNormTex, even, evenNorm);
87  createTexObject(oddTex, oddNormTex, odd, oddNorm);
88  createTexObject(evenInvTex, evenInvNormTex, evenInv, evenInvNorm);
89  createTexObject(oddInvTex, oddInvNormTex, oddInv, oddInvNorm);
90 #endif
91 
92  }
93 
94 #ifdef USE_TEXTURE_OBJECTS
95  void cudaCloverField::createTexObject(cudaTextureObject_t &tex, cudaTextureObject_t &texNorm,
96  void *field, void *norm) {
97 
98  // create the texture for the field components
99 
100  cudaChannelFormatDesc desc;
101  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
102  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
103  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
104 
105  // always four components regardless of precision
106  desc.x = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
107  desc.y = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
108  desc.z = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
109  desc.w = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
110 
111  cudaResourceDesc resDesc;
112  memset(&resDesc, 0, sizeof(resDesc));
113  resDesc.resType = cudaResourceTypeLinear;
114  resDesc.res.linear.devPtr = field;
115  resDesc.res.linear.desc = desc;
116  resDesc.res.linear.sizeInBytes = bytes/2;
117 
118  cudaTextureDesc texDesc;
119  memset(&texDesc, 0, sizeof(texDesc));
120  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
121  else texDesc.readMode = cudaReadModeElementType;
122 
123  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
124  checkCudaError();
125 
126  // create the texture for the norm components
128  cudaChannelFormatDesc desc;
129  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
130  desc.f = cudaChannelFormatKindFloat;
131  desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;
132 
133  cudaResourceDesc resDesc;
134  memset(&resDesc, 0, sizeof(resDesc));
135  resDesc.resType = cudaResourceTypeLinear;
136  resDesc.res.linear.devPtr = norm;
137  resDesc.res.linear.desc = desc;
138  resDesc.res.linear.sizeInBytes = norm_bytes/2;
139 
140  cudaTextureDesc texDesc;
141  memset(&texDesc, 0, sizeof(texDesc));
142  texDesc.readMode = cudaReadModeElementType;
143 
144  cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
145  checkCudaError();
146  }
147 
148  }
149 
150  void cudaCloverField::destroyTexObject() {
151  cudaDestroyTextureObject(evenTex);
152  cudaDestroyTextureObject(oddTex);
153  cudaDestroyTextureObject(evenInvTex);
154  cudaDestroyTextureObject(oddInvTex);
156  cudaDestroyTextureObject(evenNormTex);
157  cudaDestroyTextureObject(evenNormTex);
158  cudaDestroyTextureObject(evenInvNormTex);
159  cudaDestroyTextureObject(evenInvNormTex);
160  }
161  checkCudaError();
162  }
163 #endif
164 
166  {
167 #ifdef USE_TEXTURE_OBJECTS
168  destroyTexObject();
169 #endif
170 
171  if (clover != cloverInv) {
172  if (clover) device_free(clover);
173  if (norm) device_free(norm);
174  }
175  if (cloverInv) device_free(cloverInv);
176  if (invNorm) device_free(invNorm);
177 
178  checkCudaError();
179  }
180 
181  template <bool bqcd, typename Float>
182  static inline void packCloverMatrix(float4* a, Float *b, int Vh)
183  {
184  const Float half = bqcd ? 1.0 : 0.5; // pre-include factor of 1/2 introduced by basis change
185 
186  for (int i=0; i<18; i++) {
187  a[i*Vh].x = half * b[4*i+0];
188  a[i*Vh].y = half * b[4*i+1];
189  a[i*Vh].z = half * b[4*i+2];
190  a[i*Vh].w = half * b[4*i+3];
191  }
192  }
193 
194  template <bool bqcd, typename Float>
195  static inline void packCloverMatrix(double2* a, Float *b, int Vh)
196  {
197  const Float half = bqcd ? 1.0 : 0.5; // pre-include factor of 1/2 introduced by basis change
198 
199  for (int i=0; i<36; i++) {
200  a[i*Vh].x = half * b[2*i+0];
201  a[i*Vh].y = half * b[2*i+1];
202  }
203  }
204 
213  template <typename Float>
214  static inline void reorderBQCD(Float *quda, Float *bqcd) {
215 
216  /* int bq[36] = { 0, 1, 20, 21, 32, 33, // diagonal
217  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // column 1
218  12, 13, 14, 15, 16, 17, 18, 19, // column 2
219  22, 23, 24, 25, 26, 27, // column 3
220  28, 29, 30, 31, // column 4
221  34, 35}; */
222 
223  int bq[36] = { 21, 32, 33, 0, 1, 20, // diagonal
224  28, 29, 30, 31, 6, 7, 14, 15, 22, 23, // column 1 6
225  34, 35, 8, 9, 16, 17, 24, 25, // column 2 16
226  10, 11, 18, 19, 26, 27, // column 3 24
227  2, 3, 4, 5, // column 4 30
228  12, 13};
229 
230  // flip the sign of the imaginary components
231  int sign[36];
232  for (int i=0; i<6; i++) sign[i] = 1;
233  for (int i=6; i<36; i+=2) {
234  if ( (i >= 10 && i<= 15) || (i >= 18 && i <= 29) ) {
235  sign[i] = -1; sign[i+1] = -1;
236  } else {
237  sign[i] = 1; sign[i+1] = -1;
238  }
239  }
240 
241  // first chiral block
242  for (int i=0; i<36; i++) quda[i] = sign[i] * bqcd[bq[i]];
243 
244  // second chiral block
245  for (int i=0; i<36; i++) quda[i+36] = sign[i] * bqcd[bq[i]+36];
246  }
247 
248  template <typename Float, typename FloatN>
249  static void packParityClover(FloatN *res, Float *clover, int Vh, int pad,
250  const QudaCloverFieldOrder cpu_order)
251  {
252  if (cpu_order == QUDA_PACKED_CLOVER_ORDER) {
253  for (int i = 0; i < Vh; i++) {
254  packCloverMatrix<false>(res+i, clover+72*i, Vh+pad);
255  }
256  } else { // must be doing BQCD order
257  for (int i = 0; i < Vh; i++) {
258  Float tmp[72];
259  reorderBQCD(tmp, clover+72*i);
260  packCloverMatrix<true>(res+i, tmp, Vh+pad);
261  }
262  }
263  }
264 
265  template <typename Float, typename FloatN>
266  static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X, int pad)
267  {
268  int Vh = X[0]*X[1]*X[2]*X[3]/2;
269 
270  for (int i=0; i<Vh; i++) {
271 
272  int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]);
273 
274  { // even sites
275  int k = 2*i + boundaryCrossings%2;
276  packCloverMatrix<false>(even+i, clover+72*k, Vh+pad);
277  }
278 
279  { // odd sites
280  int k = 2*i + (boundaryCrossings+1)%2;
281  packCloverMatrix<false>(odd+i, clover+72*k, Vh+pad);
282  }
283  }
284  }
285 
286  template<bool bqcd, typename Float>
287  static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
288  {
289  const Float half = bqcd ? 1.0 : 0.5; // pre-include factor of 1/2 introduced by basis change
290  Float max, a, c;
291 
292  // treat the two chiral blocks separately
293  for (int chi=0; chi<2; chi++) {
294  max = fabs(clover[0]);
295  for (int i=1; i<36; i++) {
296  if ((a = fabs(clover[i])) > max) max = a;
297  }
298  c = MAX_SHORT/max;
299  for (int i=0; i<9; i++) {
300  res[i*Vh].x = (short) (c * clover[4*i+0]);
301  res[i*Vh].y = (short) (c * clover[4*i+1]);
302  res[i*Vh].z = (short) (c * clover[4*i+2]);
303  res[i*Vh].w = (short) (c * clover[4*i+3]);
304  }
305  norm[chi*Vh] = half*max;
306  res += 9*Vh;
307  clover += 36;
308  }
309  }
310 
311  template <typename Float>
312  static void packParityCloverHalf(short4 *res, float *norm, Float *clover,
313  int Vh, int pad, const CloverFieldOrder cpu_order)
314  {
315  if (cpu_order == QUDA_PACKED_CLOVER_ORDER) {
316  for (int i = 0; i < Vh; i++) {
317  packCloverMatrixHalf<false>(res+i, norm+i, clover+72*i, Vh+pad);
318  }
319  } else { // must be doing BQCD order
320  for (int i = 0; i < Vh; i++) {
321  Float tmp[72];
322  reorderBQCD(tmp, clover+72*i);
323  packCloverMatrixHalf<true>(res+i, norm+i, tmp, Vh+pad);
324  }
325  }
326  }
327 
328  template <typename Float>
329  static void packFullCloverHalf(short4 *even, float *evenNorm, short4 *odd, float *oddNorm,
330  Float *clover, int *X, int pad)
331  {
332  int Vh = X[0]*X[1]*X[2]*X[3]/2;
333 
334  for (int i=0; i<Vh; i++) {
335 
336  int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]);
337 
338  { // even sites
339  int k = 2*i + boundaryCrossings%2;
340  packCloverMatrixHalf<false>(even+i, evenNorm+i, clover+72*k, Vh+pad);
341  }
342 
343  { // odd sites
344  int k = 2*i + (boundaryCrossings+1)%2;
345  packCloverMatrixHalf<false>(odd+i, oddNorm+i, clover+72*k, Vh+pad);
346  }
347  }
348  }
349 
350  void cudaCloverField::loadCPUField(void *clover, void *norm, const void *h_clover,
351  const QudaPrecision cpu_prec, const CloverFieldOrder cpu_order) {
352 
353  void *h_clover_odd = (char*)h_clover + cpu_prec*real_length/2;
354 
355  if (cpu_order == QUDA_LEX_PACKED_CLOVER_ORDER) {
356  loadFullField(clover, norm, (char*)clover+bytes/2, (char*)norm+norm_bytes/2, h_clover, cpu_prec, cpu_order);
357  } else if (cpu_order == QUDA_PACKED_CLOVER_ORDER || cpu_order == QUDA_BQCD_CLOVER_ORDER) {
358  loadParityField(clover, norm, h_clover, cpu_prec, cpu_order);
359  loadParityField((char*)clover+bytes/2, (char*)norm+norm_bytes/2, h_clover_odd, cpu_prec, cpu_order);
360  } else if(cpu_order == QUDA_INTERNAL_CLOVER_ORDER) { // No reordering necessary, just a plain copy
361  cudaMemcpy(clover, h_clover, total_bytes, cudaMemcpyHostToDevice);
362  }else{
363  errorQuda("Invalid clover_order");
364  }
365  }
366 
367  void cudaCloverField::loadParityField(void *clover, void *cloverNorm, const void *h_clover,
368  const QudaPrecision cpu_prec, const CloverFieldOrder cpu_order)
369  {
370  // use pinned memory
371  void *packedClover, *packedCloverNorm=0;
372 
373  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
374  errorQuda("Cannot have CUDA double precision without CPU double precision");
375  }
376  if (cpu_order != QUDA_PACKED_CLOVER_ORDER && cpu_order != QUDA_BQCD_CLOVER_ORDER)
377  errorQuda("Invalid clover order %d", cpu_order);
378 
380  packedClover = bufferPinned;
381  if (precision == QUDA_HALF_PRECISION) packedCloverNorm = (char*)bufferPinned + bytes/2;
382 
384  packParityClover((double2 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
385  } else if (precision == QUDA_SINGLE_PRECISION) {
386  if (cpu_prec == QUDA_DOUBLE_PRECISION) {
387  packParityClover((float4 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
388  } else {
389  packParityClover((float4 *)packedClover, (float *)h_clover, volumeCB, pad, cpu_order);
390  }
391  } else {
392  if (cpu_prec == QUDA_DOUBLE_PRECISION) {
393  packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
394  (double *)h_clover, volumeCB, pad, cpu_order);
395  } else {
396  packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
397  (float *)h_clover, volumeCB, pad, cpu_order);
398  }
399  }
400 
401  cudaMemcpy(clover, packedClover, bytes/2, cudaMemcpyHostToDevice);
403  cudaMemcpy(cloverNorm, packedCloverNorm, norm_bytes/2, cudaMemcpyHostToDevice);
404  }
405 
406  void cudaCloverField::loadFullField(void *even, void *evenNorm, void *odd, void *oddNorm,
407  const void *h_clover, const QudaPrecision cpu_prec,
408  const CloverFieldOrder cpu_order)
409  {
410  // use pinned memory
411  void *packedEven, *packedOdd, *packedEvenNorm=0, *packedOddNorm=0;
412 
413  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
414  errorQuda("Cannot have CUDA double precision without CPU double precision");
415  }
416  if (cpu_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
417  errorQuda("Invalid clover order");
418  }
419 
421  packedEven = bufferPinned;
422  packedOdd = (char*)bufferPinned + bytes/2;
423 
425  packedEvenNorm = (char*)bufferPinned + bytes;
426  packedOddNorm = (char*)bufferPinned + bytes + norm_bytes/2;
427  }
428 
430  packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, x, pad);
431  } else if (precision == QUDA_SINGLE_PRECISION) {
432  if (cpu_prec == QUDA_DOUBLE_PRECISION) {
433  packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, x, pad);
434  } else {
435  packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, x, pad);
436  }
437  } else {
438  if (cpu_prec == QUDA_DOUBLE_PRECISION) {
439  packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
440  (float *) packedOddNorm, (double *)clover, x, pad);
441  } else {
442  packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
443  (float * )packedOddNorm, (float *)clover, x, pad);
444  }
445  }
446 
447  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
448  cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);
450  cudaMemcpy(evenNorm, packedEvenNorm, norm_bytes/2, cudaMemcpyHostToDevice);
451  cudaMemcpy(oddNorm, packedOddNorm, norm_bytes/2, cudaMemcpyHostToDevice);
452  }
453  }
454 
455 
459  void cudaCloverField::compute(const cudaGaugeField &gauge) {
460 
461  if (gauge.Precision() != precision)
462  errorQuda("Gauge and clover precisions must match");
463 
464  computeCloverCuda(*this, gauge);
465 
466  }
467 
468 } // namespace quda