static int numParams = 18;
#define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...) \
  if (tp.aux.x == 0) { \
    switch (tp.block.x) { \
    case 256: kernel<0, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 512: kernel<0, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 768: kernel<0, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 1024: kernel<0, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
    } \
  } else if (tp.aux.x == 1) { \
    switch (tp.block.x) { \
    case 256: kernel<1, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 512: kernel<1, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 768: kernel<1, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 1024: kernel<1, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
    } \
  } else if (tp.aux.x == 2) { \
    switch (tp.block.x) { \
    case 256: kernel<2, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 512: kernel<2, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 768: kernel<2, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 1024: kernel<2, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
    } \
  } else if (tp.aux.x == 3) { \
    switch (tp.block.x) { \
    case 128: kernel<3, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 256: kernel<3, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 384: kernel<3, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 512: kernel<3, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 640: kernel<3, 160, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 768: kernel<3, 192, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 896: kernel<3, 224, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 1024: kernel<3, 256, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
    } \
  } else if (tp.aux.x == 4) { \
    switch (tp.block.x) { \
    case 128: kernel<4, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 256: kernel<4, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 384: kernel<4, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 512: kernel<4, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 640: kernel<4, 160, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 768: kernel<4, 192, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 896: kernel<4, 224, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 1024: kernel<4, 256, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
    } \
  } else if (tp.aux.x == 5) { \
    switch (tp.block.x) { \
    case 128: kernel<5, 32, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 256: kernel<5, 64, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 384: kernel<5, 96, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 512: kernel<5, 128, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 640: kernel<5, 160, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 768: kernel<5, 192, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 896: kernel<5, 224, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    case 1024: kernel<5, 256, __VA_ARGS__><<<tp.grid.x, tp.block.x, tp.shared_bytes, stream>>>(arg, parity); break; \
    default: errorQuda("%s not implemented for %d threads", #kernel, tp.block.x); \
    } \
  } else { \
    errorQuda("Not implemented for %d", tp.aux.x); \
  }

template <typename Gauge>
struct GaugeFixQualityArg : public ReduceArg<double2> {
  // ... (data members elided in this listing)
  GaugeFixQualityArg(const Gauge &dataOr, const cudaGaugeField &data)
  // ... (initializer list elided)
  {
    for ( int dir = 0; dir < 4; ++dir ) {
      X[dir] = data.X()[dir] - data.R()[dir] * 2;
      // ...
      border[dir] = data.R()[dir];
    }
    // ...
    threads = X[0]*X[1]*X[2]*X[3]/2;
  }
  double getAction(){ return result_h[0].x; }
  double getTheta(){ return result_h[0].y; }
};
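/**
 * Measurement kernel for the gauge-fixing quality. Each thread strides over the
 * checkerboard sites it owns, builds the link sum `delta` (the loads are elided
 * in this listing), accumulates the gauge-fixing functional in data.x from the
 * real trace of delta, and accumulates the theta stopping criterion in data.y
 * starting from the anti-hermitian projection `delta -= conj(delta)` (the
 * remaining steps are elided); reduce2d() then combines the per-thread results
 * so that GaugeFixQuality::apply() can normalise them into getAction()/getTheta().
 */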
template <int blockSize, typename Float, typename Gauge, int gauge_dir>
__global__ void computeFix_quality(GaugeFixQualityArg<Gauge> argQ){
  typedef complex<Float> Cmplx;
  // ...
  int idx_cb = threadIdx.x + blockIdx.x * blockDim.x;
  // ...
  double2 data = make_double2(0.0,0.0);
  while (idx_cb < argQ.threads) {
    // ... (declarations of x, X and delta elided in this listing)
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = argQ.X[dr];
    // ... (coordinate decomposition elided)
    for ( int dr = 0; dr < 4; ++dr ) {
      x[dr] += argQ.border[dr];
      X[dr] += 2 * argQ.border[dr];
    }
    // ...
    for ( int mu = 0; mu < gauge_dir; mu++ ) {
      // ... (load links and accumulate them into delta; elided)
    }
    data.x += -delta(0, 0).x - delta(1, 1).x - delta(2, 2).x;
    // ...
    for ( int mu = 0; mu < gauge_dir; mu++ ) {
      // ... (load the neighbouring links and accumulate; elided)
    }
    // ...
    delta -= conj(delta);
    // ... (theta contribution accumulated into data.y; elided)
    idx_cb += blockDim.x * gridDim.x;
  }
  reduce2d<blockSize, 2>(argQ, data);
}
template <typename Float, typename Gauge, int gauge_dir>
class GaugeFixQuality : Tunable {   // (declaration line elided in this listing; restored)
  GaugeFixQualityArg<Gauge> argQ;
  mutable char aux_string[128];
  // ...
  bool tuneGridDim() const { return true; }
  // ...

  GaugeFixQuality(GaugeFixQualityArg<Gauge> &argQ) : argQ(argQ) { }
  ~GaugeFixQuality () { }

  void apply(const cudaStream_t &stream){
    // ... (tuneLaunch elided)
    argQ.result_h[0] = make_double2(0.0,0.0);
    // ... (kernel launch and global reduction elided)
    argQ.result_h[0].x /= (double)(3 * gauge_dir * 2 * argQ.threads * comm_size());
    argQ.result_h[0].y /= (double)(3 * 2 * argQ.threads * comm_size());
  }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << argQ.X[0] << "x";
    vol << argQ.X[1] << "x";
    vol << argQ.X[2] << "x";
    vol << argQ.X[3];
    sprintf(aux_string, "threads=%d,prec=%lu,gaugedir=%d", argQ.threads, sizeof(Float), gauge_dir);
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  long long flops() const { return (36LL * gauge_dir + 65LL) * 2 * argQ.threads; }

  long long bytes() const { return 2LL * gauge_dir * 2 * argQ.threads * numParams * sizeof(Float); }
};
template <typename Float, typename Gauge>
struct GaugeFixArg {
  // ... (data members elided in this listing)
  const Float relax_boost;
  // ...
  GaugeFixArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost)
    : dataOr(dataOr), data(data), relax_boost(relax_boost) {
    // ...
    for ( int dir = 0; dir < 4; ++dir ) {
      X[dir] = data.X()[dir] - data.R()[dir] * 2;
      // ...
      border[dir] = data.R()[dir];
    }
    // ...
    threads = X[0] * X[1] * X[2] * X[3] >> 1;
  }
};
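/**
 * computeFix applies one overrelaxation hit to every site of the given parity.
 * The launch macro and createGrid() below suggest two thread layouts: for
 * ImplementationType 0-2 each site is relaxed by 8 threads, one per forward and
 * backward link (mu = threadIdx.x / blockSize, with the upper half of the block
 * stepping to the backward neighbour via x[mu] -> x[mu] - 1), while for types
 * 3-5 each site uses 4 threads, each updating a forward link together with the
 * matching backward link (link and link1). Types 0/3 call
 * GaugeFixHit_NoAtomicAdd, 1/4 GaugeFixHit_AtomicAdd and 2/5
 * GaugeFixHit_NoAtomicAdd_LessSM; the variant is selected by the autotuner
 * through tp.aux.x in LAUNCH_KERNEL_GAUGEFIX above.
 */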
template <int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
__global__ void computeFix(GaugeFixArg<Float, Gauge> arg, int parity){
  typedef complex<Float> Cmplx;
  // ...
  int tid = (threadIdx.x + blockSize) % blockSize;
  int idx = blockIdx.x * blockSize + tid;
  // ...
  if ( idx >= arg.threads ) return;
  // ...
  if ( ImplementationType < 3 ) {
    // ... (declarations of x and X elided in this listing)
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    // ... (coordinate decomposition elided)
    for ( int dr = 0; dr < 4; ++dr ) {
      x[dr] += arg.border[dr];
      X[dr] += 2 * arg.border[dr];
    }
    // ...
    int mu = (threadIdx.x / blockSize);
    // ...
    if ( threadIdx.x >= blockSize * 4 ) {
      // ...
      x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
      // ...
    }
    idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ... (link load elided)
    if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    arg.dataOr(mu, idx, oddbit) = link;
  } else {
    // ... (declarations of x and X elided in this listing)
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    // ... (coordinate decomposition elided)
    for ( int dr = 0; dr < 4; ++dr ) {
      x[dr] += arg.border[dr];
      X[dr] += 2 * arg.border[dr];
    }
    // ...
    int mu = (threadIdx.x / blockSize);
    idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ...
    x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
    int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ... (link loads elided)
    if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    arg.dataOr(mu, idx, parity) = link;
    arg.dataOr(mu, idx1, 1 - parity) = link1;
  }
}
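/**
 * Autotuned wrapper for computeFix. The tuner's aux.x enumerates the six
 * ImplementationType variants; advanceBlockDim() steps the block size in
 * multiples of 256 threads (8 threads per site, aux.x <= 2) or 128 threads
 * (4 threads per site, aux.x > 2), and sharedBytesPerBlock() sizes the
 * shared-memory workspace for the chosen variant. The selected variant is
 * reported in the tuning output via paramString() as "atomicadd=<aux.x>".
 */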
template <typename Float, typename Gauge, int gauge_dir>
class GaugeFix : Tunable {   // (declaration line elided in this listing; restored)
  GaugeFixArg<Float, Gauge> arg;
  // ...
  mutable char aux_string[128];
  // ...
  dim3 createGrid(const TuneParam &param) const {
    unsigned int blockx = param.block.x / 8;
    if (param.aux.x > 2) blockx = param.block.x / 4;
    unsigned int gx = (arg.threads + blockx - 1) / blockx;
    return dim3(gx, 1, 1);
  }

  bool advanceBlockDim(TuneParam &param) const {
    // ...
    const unsigned int min_threads0 = 32 * 8;
    const unsigned int min_threads1 = 32 * 4;
    const unsigned int max_threads = 1024;
    const unsigned int atmadd = 0;
    unsigned int min_threads = min_threads0;
    param.aux.x += atmadd;
    if (param.aux.x > 2) min_threads = 32 * 4;
    param.block.x += min_threads;
    // ...
    param.grid = createGrid(param);
    // ...
    if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
      // ...
    } else if (param.aux.x == 0) {
      param.block.x = min_threads0;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 1) {
      param.block.x = min_threads0;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 2) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 3) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 4) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    }
    // ...
  }

  unsigned int sharedBytesPerThread() const {
    // ...
  }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
    switch (param.aux.x) {
    case 0: return param.block.x * 4 * sizeof(Float);
    case 1: return param.block.x * 4 * sizeof(Float) / 8;
    case 2: return param.block.x * 4 * sizeof(Float) / 8;
    case 3: return param.block.x * 4 * sizeof(Float);
    default: return param.block.x * sizeof(Float);
    }
  }

  bool tuneSharedBytes() const {
    // ...
  }
  bool tuneGridDim() const {
    // ...
  }
  unsigned int minThreads() const {
    // ...
  }

  GaugeFix(GaugeFixArg<Float, Gauge> &arg) : arg(arg), parity(0) { }
  // ...
  void setParity(const int par){
    // ...
  }

  void apply(const cudaStream_t &stream){
    // ... (tuneLaunch elided)
    LAUNCH_KERNEL_GAUGEFIX(computeFix, tp, stream, arg, parity, Float, Gauge, gauge_dir);
  }

  virtual void initTuneParam(TuneParam &param) const {
    param.block = dim3(256, 1, 1);
    // ...
    param.grid = createGrid(param);
  }

  virtual void defaultTuneParam(TuneParam &param) const {
    initTuneParam(param);
  }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << arg.X[0] << "x";
    vol << arg.X[1] << "x";
    vol << arg.X[2] << "x";
    vol << arg.X[3];
    sprintf(aux_string, "threads=%d,prec=%lu,gaugedir=%d", arg.threads, sizeof(Float), gauge_dir);
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  std::string paramString(const TuneParam &param) const {
    // ...
    ps << ", atomicadd=" << param.aux.x;
    // ...
  }

  long long flops() const { return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads; }

  long long bytes() const { return 8LL * 2 * arg.threads * numParams * sizeof(Float); }
};
template <typename Float, typename Gauge>
struct GaugeFixInteriorPointsArg {
  // ... (data members elided in this listing)
  const Float relax_boost;
  GaugeFixInteriorPointsArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost)
    : dataOr(dataOr), data(data), relax_boost(relax_boost) {
    // ... (the guards selecting the partitioned / non-partitioned setup are elided)
    for ( int dir = 0; dir < 4; ++dir ) {
      // if ( ... ) border[dir] = ...;   (elided in this listing)
      else border[dir] = 0;
    }
    for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir] - border[dir] * 2;
    // ...
    for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
    // ...
    threads = X[0] * X[1] * X[2] * X[3] >> 1;
  }
};
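/**
 * Interior-point variant used when the lattice is partitioned across ranks:
 * the region relaxed here excludes sites next to a partitioned boundary
 * (see the border[] setup above), so the interior sweep can run concurrently
 * with the exchange of the border/face links in the driver below.
 */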
template <int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
__global__ void computeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge> arg, int parity){
  int tid = (threadIdx.x + blockSize) % blockSize;
  int idx = blockIdx.x * blockSize + tid;
  if ( idx >= arg.threads ) return;
  typedef complex<Float> Complex;
  // ... (declarations of x and X elided in this listing)
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
  // ...
  int za = (idx / (X[0] / 2));
  int zb = (za / X[1]);
  x[1] = za - zb * X[1];
  // ...
  x[2] = zb - x[3] * X[2];
  int p = 0;
  for ( int dr = 0; dr < 4; ++dr ) p += arg.border[dr];
  // ...
  int x1odd = (x[1] + x[2] + x[3] + parity + p) & 1;
  // ...
  x[0] = (2 * idx + x1odd) - za * X[0];
  for ( int dr = 0; dr < 4; ++dr ) {
    x[dr] += arg.border[dr];
    X[dr] += 2 * arg.border[dr];
  }
  // ...
  int mu = (threadIdx.x / blockSize);
  // ...
  if ( ImplementationType < 3 ) {
    if ( threadIdx.x >= blockSize * 4 ) {
      // ...
      x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
      // ...
    }
    idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ... (link load elided)
    if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    arg.dataOr(mu, idx, parity) = link;
  } else {
    // ...
    idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ...
    x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
    int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ... (link loads elided)
    if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    arg.dataOr(mu, idx, parity) = link;
    arg.dataOr(mu, idx1, 1 - parity) = link1;
  }
}
template <typename Float, typename Gauge, int gauge_dir>
class GaugeFixInteriorPoints : Tunable {
  GaugeFixInteriorPointsArg<Float, Gauge> arg;
  // ...
  mutable char aux_string[128];
  // ...
  dim3 createGrid(const TuneParam &param) const {
    unsigned int blockx = param.block.x / 8;
    if (param.aux.x > 2) blockx = param.block.x / 4;
    unsigned int gx = (arg.threads + blockx - 1) / blockx;
    return dim3(gx, 1, 1);
  }

  bool advanceBlockDim(TuneParam &param) const {
    // ...
    const unsigned int min_threads0 = 32 * 8;
    const unsigned int min_threads1 = 32 * 4;
    const unsigned int max_threads = 1024;
    const unsigned int atmadd = 0;
    unsigned int min_threads = min_threads0;
    param.aux.x += atmadd;
    if (param.aux.x > 2) min_threads = 32 * 4;
    param.block.x += min_threads;
    // ...
    param.grid = createGrid(param);
    // ...
    if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
      // ...
    } else if (param.aux.x == 0) {
      param.block.x = min_threads0;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 1) {
      param.block.x = min_threads0;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 2) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 3) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 4) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    }
    // ...
  }

  unsigned int sharedBytesPerThread() const {
    // ...
  }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
    switch (param.aux.x) {
    case 0: return param.block.x * 4 * sizeof(Float);
    case 1: return param.block.x * 4 * sizeof(Float) / 8;
    case 2: return param.block.x * 4 * sizeof(Float) / 8;
    case 3: return param.block.x * 4 * sizeof(Float);
    default: return param.block.x * sizeof(Float);
    }
  }

  bool tuneSharedBytes() const {
    // ...
  }
  bool tuneGridDim() const {
    // ...
  }
  unsigned int minThreads() const {
    // ...
  }

  GaugeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge> &arg) : arg(arg), parity(0) {}

  ~GaugeFixInteriorPoints () { }

  void setParity(const int par) { parity = par; }

  void apply(const cudaStream_t &stream) {
    // ... (tuneLaunch elided)
    LAUNCH_KERNEL_GAUGEFIX(computeFixInteriorPoints, tp, stream, arg, parity, Float, Gauge, gauge_dir);
  }

  virtual void initTuneParam(TuneParam &param) const {
    param.block = dim3(256, 1, 1);
    // ...
    param.grid = createGrid(param);
  }

  virtual void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << arg.X[0] << "x";
    vol << arg.X[1] << "x";
    vol << arg.X[2] << "x";
    vol << arg.X[3];
    sprintf(aux_string, "threads=%d,prec=%lu,gaugedir=%d", arg.threads, sizeof(Float), gauge_dir);
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  std::string paramString(const TuneParam &param) const {
    // ...
    ps << ", atomicadd=" << param.aux.x;
    // ...
  }

  long long flops() const { return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads; }

  long long bytes() const { return 8LL * 2 * arg.threads * numParams * sizeof(Float); }
};
template <typename Float, typename Gauge>
struct GaugeFixBorderPointsArg {
  // ... (data members elided in this listing)
  int *borderpoints[2];
  int *faceindicessize[2];
  // ...
  size_t faceVolumeCB[4];
  // ...
  const Float relax_boost;
  // ...
  GaugeFixBorderPointsArg(Gauge & dataOr, cudaGaugeField & data, const Float relax_boost,
                          size_t faceVolume_[4], size_t faceVolumeCB_[4])
    : dataOr(dataOr), data(data), relax_boost(relax_boost) {
    // ...
    for ( int dir = 0; dir < 4; ++dir ) {
      X[dir] = data.X()[dir] - data.R()[dir] * 2;
      border[dir] = data.R()[dir];
    }
    // ...
    for ( int dir = 0; dir < 4; ++dir ) {
      faceVolume[dir] = faceVolume_[dir];
      faceVolumeCB[dir] = faceVolumeCB_[dir];
    }
    // ...
    if ( comm_partitioned() ) PreCalculateLatticeIndices(faceVolume, faceVolumeCB, X, border, threads, borderpoints);
  }
};
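/**
 * Border-point variant: PreCalculateLatticeIndices() precomputes, for each
 * parity, the list of checkerboard indices that sit on a partitioned face
 * (borderpoints[]) and sets `threads` to the number of such sites. These are
 * the sites whose updated links have to be re-exchanged with the neighbouring
 * ranks every sweep.
 */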
template <int ImplementationType, int blockSize, typename Float, typename Gauge, int gauge_dir>
__global__ void computeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge> arg, int parity){
  typedef complex<Float> Cmplx;
  // ...
  int tid = (threadIdx.x + blockSize) % blockSize;
  int idx = blockIdx.x * blockSize + tid;
  if ( idx >= arg.threads ) return;
  int mu = (threadIdx.x / blockSize);
  idx = arg.borderpoints[parity][idx];
  // ... (declarations of x and X elided in this listing)
  x[3] = idx / (arg.X[0] * arg.X[1] * arg.X[2]);
  x[2] = (idx / (arg.X[0] * arg.X[1])) % arg.X[2];
  x[1] = (idx / arg.X[0]) % arg.X[1];
  x[0] = idx % arg.X[0];
  // ...
  for ( int dr = 0; dr < 4; ++dr ) x[dr] += arg.border[dr];
  // ...
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr] + 2 * arg.border[dr];
  // ...
  if ( ImplementationType < 3 ) {
    if ( threadIdx.x >= blockSize * 4 ) {
      // ...
      x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
      // ...
    }
    idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ... (link load elided)
    if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, arg.relax_boost, tid);
    arg.dataOr(mu, idx, parity) = link;
  } else {
    // ...
    idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ...
    x[mu] = (x[mu] - 1 + X[mu]) % X[mu];
    int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    // ... (link loads elided)
    if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    // ...
    if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1, arg.relax_boost, tid);
    arg.dataOr(mu, idx, parity) = link;
    arg.dataOr(mu, idx1, 1 - parity) = link1;
  }
}
template <typename Float, typename Gauge, int gauge_dir>
class GaugeFixBorderPoints : Tunable {
  GaugeFixBorderPointsArg<Float, Gauge> arg;
  // ...
  mutable char aux_string[128];
  // ...
  dim3 createGrid(const TuneParam &param) const {
    unsigned int blockx = param.block.x / 8;
    if (param.aux.x > 2) blockx = param.block.x / 4;
    unsigned int gx = (arg.threads + blockx - 1) / blockx;
    return dim3(gx, 1, 1);
  }

  bool advanceBlockDim(TuneParam &param) const {
    // ...
    const unsigned int min_threads0 = 32 * 8;
    const unsigned int min_threads1 = 32 * 4;
    const unsigned int max_threads = 1024;
    const unsigned int atmadd = 0;
    unsigned int min_threads = min_threads0;
    param.aux.x += atmadd;
    if (param.aux.x > 2) min_threads = 32 * 4;
    param.block.x += min_threads;
    // ...
    param.grid = createGrid(param);
    // ...
    if ((param.block.x >= min_threads) && (param.block.x <= max_threads)) {
      // ...
    } else if (param.aux.x == 0) {
      param.block.x = min_threads0;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 1) {
      param.block.x = min_threads0;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 2) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 3) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    } else if (param.aux.x == 4) {
      param.block.x = min_threads1;
      // ...
      param.grid = createGrid(param);
      // ...
    }
    // ...
  }

  unsigned int sharedBytesPerThread() const {
    // ...
  }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
    switch (param.aux.x) {
    case 0: return param.block.x * 4 * sizeof(Float);
    case 1: return param.block.x * 4 * sizeof(Float) / 8;
    case 2: return param.block.x * 4 * sizeof(Float) / 8;
    case 3: return param.block.x * 4 * sizeof(Float);
    default: return param.block.x * sizeof(Float);
    }
  }

  bool tuneSharedBytes() const {
    // ...
  }
  bool tuneGridDim() const {
    // ...
  }
  unsigned int minThreads() const {
    // ...
  }

  GaugeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge> &arg) : arg(arg), parity(0) { }

  ~GaugeFixBorderPoints () {
    // ... (cleanup elided in this listing)
  }

  void setParity(const int par){
    // ...
  }

  void apply(const cudaStream_t &stream){
    // ... (tuneLaunch elided)
    LAUNCH_KERNEL_GAUGEFIX(computeFixBorderPoints, tp, stream, arg, parity, Float, Gauge, gauge_dir);
  }

  virtual void initTuneParam(TuneParam &param) const {
    param.block = dim3(256, 1, 1);
    // ...
    param.grid = createGrid(param);
  }

  virtual void defaultTuneParam(TuneParam &param) const {
    initTuneParam(param);
  }

  TuneKey tuneKey() const {
    std::stringstream vol;
    vol << arg.X[0] << "x";
    vol << arg.X[1] << "x";
    vol << arg.X[2] << "x";
    vol << arg.X[3];
    sprintf(aux_string, "threads=%d,prec=%lu,gaugedir=%d", arg.threads, sizeof(Float), gauge_dir);
    return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
  }

  std::string paramString(const TuneParam &param) const {
    // ...
    ps << ", atomicadd=" << param.aux.x;
    // ...
  }

  long long flops() const { return 3LL * (22 + 28 * gauge_dir + 224 * 3) * arg.threads; }

  long long bytes() const { return 8LL * 2 * arg.threads * numParams * sizeof(Float); }
};
template <typename Gauge>
struct GaugeFixUnPackArg {
  // ... (data members elided in this listing)
  GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)   // (signature restored from the call site below; initializer list elided)
  {
    for ( int dir = 0; dir < 4; ++dir ) {
      X[dir] = data.X()[dir] - data.R()[dir] * 2;
      // ...
      border[dir] = data.R()[dir];
    }
    // ...
  }
};
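/**
 * Kernel_UnPackGhost / Kernel_UnPackTop move the links lying on a face of the
 * local volume between the extended gauge field and a contiguous complex<Float>
 * buffer: the visible branches suggest that with pack == true a link is loaded,
 * Pack()ed into NElems real numbers and written to the buffer (ready for the
 * copies and sends in the driver), while with pack == false a received buffer
 * is Unpack()ed and saved back into the field. "Top" indexes the face at
 * coordinate arg.X[face] - 1; "Ghost" presumably handles the opposite face
 * (its borderid setup is elided in this listing).
 */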
template <int NElems, typename Float, typename Gauge, bool pack>
__global__ void Kernel_UnPackGhost(int size, GaugeFixUnPackArg<Gauge> arg, complex<Float> *array, int parity, int face, int dir){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if ( idx >= size ) return;
  // ... (declarations of x and X elided in this listing)
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
  // ...
  parity = 1 - parity;
  // ... (borderid setup and the selection on `face` are elided; the four blocks
  //      below decompose idx into the coordinates transverse to faces 0-3)
  za = idx / ( X[1] / 2);
  // ...
  x[2] = za - x[3] * X[2];
  // ...
  xodd = (borderid + x[2] + x[3] + parity) & 1;
  x[1] = (2 * idx + xodd) - za * X[1];
  // ...
  za = idx / ( X[0] / 2);
  // ...
  x[2] = za - x[3] * X[2];
  // ...
  xodd = (borderid + x[2] + x[3] + parity) & 1;
  x[0] = (2 * idx + xodd) - za * X[0];
  // ...
  za = idx / ( X[0] / 2);
  // ...
  x[1] = za - x[3] * X[1];
  // ...
  xodd = (borderid + x[1] + x[3] + parity) & 1;
  x[0] = (2 * idx + xodd) - za * X[0];
  // ...
  za = idx / ( X[0] / 2);
  // ...
  x[1] = za - x[2] * X[1];
  // ...
  xodd = (borderid + x[1] + x[2] + parity) & 1;
  x[0] = (2 * idx + xodd) - za * X[0];
  // ...
  for ( int dr = 0; dr < 4; ++dr ) {
    x[dr] += arg.border[dr];
    X[dr] += 2 * arg.border[dr];
  }
  // ...
  int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
  typedef complex<Float> Cmplx;
  // ...
  RegType tmp[NElems];
  // ... (the pack / unpack selection is elided)
  arg.dataOr.load(data, id, dir, parity);
  arg.dataOr.reconstruct.Pack(tmp, data, id);
  for ( int i = 0; i < NElems / 2; ++i ) {
    array[idx + size * i] = Cmplx(tmp[2*i+0], tmp[2*i+1]);
  }
  // ...
  for ( int i = 0; i < NElems / 2; ++i ) {
    tmp[2*i+0] = array[idx + size * i].real();
    tmp[2*i+1] = array[idx + size * i].imag();
  }
  arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
  arg.dataOr.save(data, id, dir, parity);
}
template <int NElems, typename Float, typename Gauge, bool pack>
__global__ void Kernel_UnPackTop(int size, GaugeFixUnPackArg<Gauge> arg, complex<Float> *array, int parity, int face, int dir){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if ( idx >= size ) return;
  // ... (declarations of x and X elided in this listing)
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
  // ...
  int borderid = arg.X[face] - 1;
  // ... (the selection on `face` is elided; the four blocks below decompose idx
  //      into the coordinates transverse to faces 0-3)
  za = idx / ( X[1] / 2);
  // ...
  x[2] = za - x[3] * X[2];
  // ...
  xodd = (borderid + x[2] + x[3] + parity) & 1;
  x[1] = (2 * idx + xodd) - za * X[1];
  // ...
  za = idx / ( X[0] / 2);
  // ...
  x[2] = za - x[3] * X[2];
  // ...
  xodd = (borderid + x[2] + x[3] + parity) & 1;
  x[0] = (2 * idx + xodd) - za * X[0];
  // ...
  za = idx / ( X[0] / 2);
  // ...
  x[1] = za - x[3] * X[1];
  // ...
  xodd = (borderid + x[1] + x[3] + parity) & 1;
  x[0] = (2 * idx + xodd) - za * X[0];
  // ...
  za = idx / ( X[0] / 2);
  // ...
  x[1] = za - x[2] * X[1];
  // ...
  xodd = (borderid + x[1] + x[2] + parity) & 1;
  x[0] = (2 * idx + xodd) - za * X[0];
  // ...
  for ( int dr = 0; dr < 4; ++dr ) {
    x[dr] += arg.border[dr];
    X[dr] += 2 * arg.border[dr];
  }
  // ...
  int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
  typedef complex<Float> Cmplx;
  // ...
  RegType tmp[NElems];
  // ... (the pack / unpack selection is elided)
  arg.dataOr.load(data, id, dir, parity);
  arg.dataOr.reconstruct.Pack(tmp, data, id);
  for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = Cmplx(tmp[2*i+0], tmp[2*i+1]);
  // ...
  for ( int i = 0; i < NElems / 2; ++i ) {
    tmp[2*i+0] = array[idx + size * i].real();
    tmp[2*i+1] = array[idx + size * i].imag();
  }
  arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
  arg.dataOr.save(data, id, dir, parity);
}
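/**
 * Driver for the overrelaxation gauge fixing. Each iteration sweeps both
 * parities; on a partitioned lattice a sweep relaxes the border points first,
 * packs the updated face links with Kernel_UnPackTop/Kernel_UnPackGhost and
 * copies them towards the neighbouring ranks on per-dimension streams while the
 * interior points are relaxed on GFStream[8], and finally unpacks the received
 * faces back into the extended field. Links are reunitarized every
 * reunit_interval iterations, progress is printed every verbose_interval
 * iterations, and the loop stops once theta or the change of the action
 * (depending on stopWtheta) falls below tolerance.
 */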
template <typename Float, typename Gauge, int NElems, int gauge_dir>
void gaugefixingOVR(Gauge dataOr, cudaGaugeField& data,       // (this line is elided in the listing; restored from the call sites below)
                    const int Nsteps, const int verbose_interval,
                    const Float relax_boost, const double tolerance,
                    const int reunit_interval, const int stopWtheta) {
  // ...
  TimeProfile profileInternalGaugeFixOVR("InternalGaugeFixQudaOVR", false);
  // ...
  printfQuda("\tOverrelaxation boost parameter: %lf\n", (double)relax_boost);
  printfQuda("\tStop criterium: %lf\n", tolerance);
  if ( stopWtheta ) printfQuda("\tStop criterium method: theta\n");
  else printfQuda("\tStop criterium method: Delta\n");
  printfQuda("\tMaximum number of iterations: %d\n", Nsteps);
  printfQuda("\tReunitarize at every %d steps\n", reunit_interval);
  printfQuda("\tPrint convergence results at every %d steps\n", verbose_interval);
  // ...
  const double max_error = 1e-10;
  // ... (unitarization constants elided; the call ends with
  //      reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error)
  cudaMemset(num_failures_dev, 0, sizeof(int));
  // ...
  GaugeFixQualityArg<Gauge> argQ(dataOr, data);
  GaugeFixQuality<Float, Gauge, gauge_dir> GaugeFixQuality(argQ);
  // ...
  GaugeFixArg<Float, Gauge> arg(dataOr, data, relax_boost);
  GaugeFix<Float, Gauge, gauge_dir> gaugeFix(arg);

  // ... (multi-GPU exchange buffers; most declarations are elided in this listing)
  void *hostbuffer_h[4];
  cudaStream_t GFStream[9];
  // ...
  size_t faceVolume[4];
  size_t faceVolumeCB[4];
  // ...
  for ( int dir = 0; dir < 4; ++dir ) {
    X[dir] = data.X()[dir] - data.R()[dir] * 2;
    // ...
  }
  for ( int i = 0; i < 4; i++ ) {
    // ...
    for ( int j = 0; j < 4; j++ ) {
      if ( i == j ) continue;
      faceVolume[i] *= X[j];
    }
    // ...
    faceVolumeCB[i] = faceVolume[i] / 2;
  }
  // ...
  for ( int d = 0; d < 4; d++ ) {
    // ...
    offset[d] = faceVolumeCB[d] * NElems;
    bytes[d] = sizeof(Float) * offset[d];
    // ... (buffer allocation elided)
    cudaStreamCreate(&GFStream[d]);
    cudaStreamCreate(&GFStream[4 + d]);
    // ...
    block[d] = make_uint3(128, 1, 1);
    grid[d] = make_uint3((faceVolumeCB[d] + block[d].x - 1) / block[d].x, 1, 1);
  }
  cudaStreamCreate(&GFStream[8]);
  for ( int d = 0; d < 4; d++ ) {
    // ... (GPU-aware / host-staging selection elided)
    recv[d] = recv_d[d];
    send[d] = send_d[d];
    recvg[d] = recvg_d[d];
    sendg[d] = sendg_d[d];
    // ...
    recv[d] = hostbuffer_h[d];
    send[d] = static_cast<char*>(hostbuffer_h[d]) + bytes[d];
    recvg[d] = static_cast<char*>(hostbuffer_h[d]) + 3 * bytes[d];
    sendg[d] = static_cast<char*>(hostbuffer_h[d]) + 2 * bytes[d];
    // ... (message handle setup elided)
  }
  // ...
  GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);
  GaugeFixBorderPointsArg<Float, Gauge> argBorder(dataOr, data, relax_boost, faceVolume, faceVolumeCB);
  GaugeFixBorderPoints<Float, Gauge, gauge_dir> gfixBorderPoints(argBorder);
  GaugeFixInteriorPointsArg<Float, Gauge> argInt(dataOr, data, relax_boost);
  GaugeFixInteriorPoints<Float, Gauge, gauge_dir> gfixIntPoints(argInt);
  // ...
  GaugeFixQuality.apply(0);
  flop += (double)GaugeFixQuality.flops();
  byte += (double)GaugeFixQuality.bytes();
  double action0 = argQ.getAction();
  printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
  // ... (initial reunitarization elided)
  qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
  if ( num_failures > 0 ) {
    // ...
    errorQuda("Error in the unitarization\n");
    // ...
  }
  cudaMemset(num_failures_dev, 0, sizeof(int));
  // ...
  for ( iter = 0; iter < Nsteps; iter++ ) {
    for ( int p = 0; p < 2; p++ ) {
      // ... (single-GPU / non-partitioned path; the surrounding guards are elided)
      gaugeFix.setParity(p);
      // ...
      flop += (double)gaugeFix.flops();
      byte += (double)gaugeFix.bytes();
      // ...
      gaugeFix.setParity(p);
      // ...
      flop += (double)gaugeFix.flops();
      byte += (double)gaugeFix.bytes();
      // ... (partitioned path)
      gfixIntPoints.setParity(p);
      gfixBorderPoints.setParity(p);
      gfixBorderPoints.apply(0);
      flop += (double)gfixBorderPoints.flops();
      byte += (double)gfixBorderPoints.bytes();
      flop += (double)gfixIntPoints.flops();
      byte += (double)gfixIntPoints.bytes();
      for ( int d = 0; d < 4; d++ ) {
        // ...
      }
      // pack the updated border links into the send buffers
      for ( int d = 0; d < 4; d++ ) {
        // ...
        Kernel_UnPackTop<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[d] >>>
          (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), p, d, d);
        // ...
        Kernel_UnPackGhost<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[4 + d] >>>
          (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(sendg_d[d]), 1 - p, d, d);
      }
      for ( int d = 0; d < 4; d++ ) {
        // ...
      }
      for ( int d = 0; d < 4; d++ ) {
        // ...
        cudaMemcpyAsync(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[d]);
      }
      for ( int d = 0; d < 4; d++ ) {
        // ...
        cudaMemcpyAsync(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[4 + d]);
      }
      // relax the interior while the face exchange is in flight
      gfixIntPoints.apply(GFStream[8]);
      // ...
      for ( int d = 0; d < 4; d++ ) {
        // ...
      }
      for ( int d = 0; d < 4; d++ ) {
        // ...
        cudaMemcpyAsync(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice, GFStream[d]);
      }
      for ( int d = 0; d < 4; d++ ) {
        // ...
        cudaMemcpyAsync(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice, GFStream[4 + d]);
      }
      // unpack the received faces back into the extended field
      for ( int d = 0; d < 4; d++ ) {
        // ...
        Kernel_UnPackGhost<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[d] >>>
          (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), p, d, d);
      }
      for ( int d = 0; d < 4; d++ ) {
        // ...
        Kernel_UnPackTop<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[4 + d] >>>
          (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recvg_d[d]), 1 - p, d, d);
      }
      for ( int d = 0; d < 4; d++ ) {
        // ...
      }
      // ...
    }
    // ...
    if ((iter % reunit_interval) == (reunit_interval - 1)) {
      // ... (reunitarize the links; elided)
      qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
      if ( num_failures > 0 ) errorQuda("Error in the unitarization\n");
      cudaMemset(num_failures_dev, 0, sizeof(int));
      flop += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3];
      byte += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes();
    }
    // ...
    GaugeFixQuality.apply(0);
    flop += (double)GaugeFixQuality.flops();
    byte += (double)GaugeFixQuality.bytes();
    double action = argQ.getAction();
    double diff = abs(action0 - action);
    if ((iter % verbose_interval) == (verbose_interval - 1))
      printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
    if ( stopWtheta ) {
      if ( argQ.getTheta() < tolerance ) break;
    } else {
      if ( diff < tolerance ) break;
    }
    // ... (action0 update elided)
  }
  // ...
  if ((iter % reunit_interval) != 0 ) {
    // ... (final reunitarization; elided)
    qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
    if ( num_failures > 0 ) errorQuda("Error in the unitarization\n");
    cudaMemset(num_failures_dev, 0, sizeof(int));
    flop += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3];
    byte += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes();
  }
  // ...
  if ((iter % verbose_interval) != 0 ) {
    GaugeFixQuality.apply(0);
    flop += (double)GaugeFixQuality.flops();
    byte += (double)GaugeFixQuality.bytes();
    double action = argQ.getAction();
    double diff = abs(action0 - action);
    printfQuda("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
  }
  // ... (multi-GPU cleanup)
  for ( int d = 0; d < 4; d++ ) {
    // ...
    cudaStreamDestroy(GFStream[d]);
    cudaStreamDestroy(GFStream[4 + d]);
    // ...
  }
  cudaStreamDestroy(GFStream[8]);
  // ...
  double gflops = (flop * 1e-9) / (secs);
  double gbytes = byte / (secs * 1e9);
  // ...
  printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
  // ...
}
template <typename Float, int NElems, typename Gauge>
void gaugefixingOVR(Gauge dataOr, cudaGaugeField& data, const int gauge_dir,
                    const int Nsteps, const int verbose_interval,   // (leading part of the signature elided in the listing; restored from the call sites)
                    const Float relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) {
  if ( gauge_dir != 3 ) {
    printfQuda("Starting Landau gauge fixing...\n");
    gaugefixingOVR<Float, Gauge, NElems, 4>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
  } else {
    printfQuda("Starting Coulomb gauge fixing...\n");
    gaugefixingOVR<Float, Gauge, NElems, 3>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
  }
}

template <typename Float>
void gaugefixingOVR(cudaGaugeField& data, const int gauge_dir,
                    const int Nsteps, const int verbose_interval,   // (leading part of the signature elided in the listing; restored from the call sites)
                    const Float relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) {
  // ... (the selection on data.Reconstruct() and the per-case Gauge typedefs are elided)
  gaugefixingOVR<Float, 18>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
  // ...
  gaugefixingOVR<Float, 12>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
  // ...
  gaugefixingOVR<Float, 8>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
  // ...
}
#endif // GPU_GAUGE_ALG

void gaugefixingOVR(cudaGaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval,   // (leading parameters elided in the listing; they match the public declaration)
                    const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) {
#ifdef GPU_GAUGE_ALG
  // ... (precision selection elided in this listing)
  errorQuda("Half precision not supported\n");
  // ...
  gaugefixingOVR<float> (data, gauge_dir, Nsteps, verbose_interval, (float)relax_boost, tolerance, reunit_interval, stopWtheta);
  // ...
  gaugefixingOVR<double>(data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
  // ...
#else
  errorQuda("Gauge fixing has not been built");
#endif // GPU_GAUGE_ALG
}
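// Usage sketch (illustrative only, not part of this file): a caller holding an
// extended cudaGaugeField would invoke the interface routine above roughly as
// follows (the argument values are hypothetical; gauge_dir = 3 requests Coulomb
// gauge, any other value Landau gauge):
//
//   gaugefixingOVR(*gauge, 3 /*gauge_dir*/, 1000 /*Nsteps*/, 100 /*verbose_interval*/,
//                  1.5 /*relax_boost*/, 1e-12 /*tolerance*/, 10 /*reunit_interval*/,
//                  0 /*stopWtheta*/);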