18 static int numParams = 18;
20 #define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...) \ 21 if ( tp.block.z == 0 ) { \ 22 switch ( tp.block.x ) { \ 24 kernel<0, 32,__VA_ARGS__> \ 25 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 28 kernel<0, 64,__VA_ARGS__> \ 29 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 32 kernel<0, 96,__VA_ARGS__> \ 33 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 36 kernel<0, 128,__VA_ARGS__> \ 37 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 40 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 43 else if ( tp.block.z == 1 ) { \ 44 switch ( tp.block.x ) { \ 46 kernel<1, 32,__VA_ARGS__> \ 47 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 50 kernel<1, 64,__VA_ARGS__> \ 51 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 54 kernel<1, 96,__VA_ARGS__> \ 55 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 58 kernel<1, 128,__VA_ARGS__> \ 59 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 62 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 65 else if ( tp.block.z == 2 ) { \ 66 switch ( tp.block.x ) { \ 68 kernel<2, 32,__VA_ARGS__> \ 69 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 72 kernel<2, 64,__VA_ARGS__> \ 73 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 76 kernel<2, 96,__VA_ARGS__> \ 77 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 80 kernel<2, 128,__VA_ARGS__> \ 81 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 84 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 87 else if ( tp.block.z == 3 ) { \ 88 switch ( tp.block.x ) { \ 90 kernel<3, 32,__VA_ARGS__> \ 91 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 94 kernel<3, 64,__VA_ARGS__> \ 95 << < 
tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 98 kernel<3, 96,__VA_ARGS__> \ 99 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 102 kernel<3, 128,__VA_ARGS__> \ 103 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 106 kernel<3, 160,__VA_ARGS__> \ 107 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 110 kernel<3, 192,__VA_ARGS__> \ 111 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 114 kernel<3, 224,__VA_ARGS__> \ 115 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 118 kernel<3, 256,__VA_ARGS__> \ 119 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 122 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 125 else if ( tp.block.z == 4 ) { \ 126 switch ( tp.block.x ) { \ 128 kernel<4, 32,__VA_ARGS__> \ 129 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 132 kernel<4, 64,__VA_ARGS__> \ 133 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 136 kernel<4, 96,__VA_ARGS__> \ 137 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 140 kernel<4, 128,__VA_ARGS__> \ 141 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 144 kernel<4, 160,__VA_ARGS__> \ 145 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 148 kernel<4, 192,__VA_ARGS__> \ 149 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 152 kernel<4, 224,__VA_ARGS__> \ 153 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 156 kernel<4, 256,__VA_ARGS__> \ 157 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 160 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 163 else if ( tp.block.z == 5 ) { \ 164 switch ( tp.block.x ) { \ 166 kernel<5, 32,__VA_ARGS__> \ 167 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, 
parity); \ 170 kernel<5, 64,__VA_ARGS__> \ 171 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 174 kernel<5, 96,__VA_ARGS__> \ 175 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 178 kernel<5, 128,__VA_ARGS__> \ 179 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 182 kernel<5, 160,__VA_ARGS__> \ 183 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 186 kernel<5, 192,__VA_ARGS__> \ 187 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 190 kernel<5, 224,__VA_ARGS__> \ 191 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 194 kernel<5, 256,__VA_ARGS__> \ 195 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 198 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 202 errorQuda("Not implemented for %d", tp.block.z); \ 209 template <
typename Gauge>
210 struct GaugeFixQualityArg :
public ReduceArg<double2> {
217 GaugeFixQualityArg(
const Gauge &dataOr,
const cudaGaugeField &data)
220 for (
int dir = 0; dir < 4; ++dir ) {
221 X[dir] = data.
X()[dir] - data.
R()[dir] * 2;
223 border[dir] = data.
R()[dir];
226 threads =
X[0]*
X[1]*
X[2]*
X[3]/2;
228 double getAction(){
return result_h[0].x; }
229 double getTheta(){
return result_h[0].y; }
236 template<
int blockSize,
typename Float,
typename Gauge,
int gauge_dir>
237 __global__
void computeFix_quality(GaugeFixQualityArg<Gauge> argQ){
238 typedef complex<Float> Cmplx;
243 double2 data = make_double2(0.0,0.0);
244 if (
idx < argQ.threads ) {
247 for (
int dr = 0; dr < 4; ++dr )
X[dr] = argQ.X[dr];
253 for (
int dr = 0; dr < 4; ++dr ) {
254 x[dr] += argQ.border[dr];
255 X[dr] += 2 * argQ.border[dr];
262 for (
int mu = 0;
mu < gauge_dir;
mu++ ) {
271 for (
int mu = 0;
mu < gauge_dir;
mu++ ) {
285 reduce2d<blockSize,2>(argQ, data);
292 template<
typename Float,
typename Gauge,
int gauge_dir>
294 GaugeFixQualityArg<Gauge> argQ;
295 mutable char aux_string[128];
298 unsigned int minThreads()
const {
return argQ.threads; }
301 GaugeFixQuality(GaugeFixQualityArg<Gauge> &argQ) : argQ(argQ) { }
302 ~GaugeFixQuality () { }
304 void apply(
const cudaStream_t &
stream){
306 argQ.result_h[0] = make_double2(0.0,0.0);
310 argQ.result_h[0].x /= (
double)(3 * gauge_dir * 2 * argQ.threads *
comm_size());
315 std::stringstream vol;
316 vol << argQ.X[0] <<
"x";
317 vol << argQ.X[1] <<
"x";
318 vol << argQ.X[2] <<
"x";
320 sprintf(aux_string,
"threads=%d,prec=%lu,gaugedir=%d",argQ.threads,
sizeof(Float),gauge_dir);
321 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
325 std::stringstream ps;
326 ps <<
"block=(" <<
param.block.x <<
"," <<
param.block.y <<
"," <<
param.block.z <<
")";
327 ps <<
"shared=" <<
param.shared_bytes;
330 long long flops()
const {
331 return (36LL * gauge_dir + 65LL) * 2 * argQ.threads;
334 long long bytes()
const {
335 return 2LL * gauge_dir * 2 * argQ.threads * numParams *
sizeof(Float);
344 template <
typename Float,
typename Gauge>
353 const Float relax_boost;
355 GaugeFixArg(Gauge & dataOr,
cudaGaugeField & data,
const Float relax_boost)
356 : dataOr(dataOr), data(data), relax_boost(relax_boost) {
358 for (
int dir = 0; dir < 4; ++dir ) {
359 X[dir] = data.
X()[dir] - data.
R()[dir] * 2;
361 border[dir] = data.
R()[dir];
364 threads =
X[0] *
X[1] *
X[2] *
X[3] >> 1;
374 template<
int ImplementationType,
int blockSize,
typename Float,
typename Gauge,
int gauge_dir>
375 __global__
void computeFix(GaugeFixArg<Float, Gauge>
arg,
int parity){
376 typedef complex<Float> Cmplx;
381 if (
idx >=
arg.threads )
return;
384 if ( ImplementationType < 3 ) {
387 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
393 for (
int dr = 0; dr < 4; ++dr ) {
394 x[dr] +=
arg.border[dr];
395 X[dr] += 2 *
arg.border[dr];
405 idx = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
410 if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
412 if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
415 if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
416 arg.dataOr.save((Float*)(link.data),
idx,
mu, oddbit);
422 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
428 for (
int dr = 0; dr < 4; ++dr ) {
429 x[dr] +=
arg.border[dr];
430 X[dr] += 2 *
arg.border[dr];
434 idx = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
441 int idx1 = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
448 if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
450 if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
453 if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
456 arg.dataOr.save((Float*)(link1.data),idx1,
mu, 1 -
parity);
469 template<
typename Float,
typename Gauge,
int gauge_dir>
471 GaugeFixArg<Float, Gauge>
arg;
473 mutable char aux_string[128];
476 dim3 createGrid (
const dim3 &
block)
const {
477 unsigned int blockx =
block.x / 8;
479 unsigned int gx = (
arg.threads + blockx - 1) / blockx;
480 return dim3(gx, 1, 1);
485 const unsigned int min_threads0 = 32 * 8;
486 const unsigned int min_threads1 = 32 * 4;
487 const unsigned int max_threads = 1024;
488 const unsigned int atmadd = 0;
489 unsigned int min_threads = min_threads0;
490 param.block.z += atmadd;
491 if (
param.block.z > 2 ) min_threads = 32 * 4;
492 param.block.x += min_threads;
498 if ((
param.block.x >= min_threads) && (
param.block.x <= max_threads)) {
499 if (
param.block.z == 0 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
500 else if (
param.block.z == 1 ||
param.block.z == 2 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
501 else if (
param.block.z == 3 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
502 else if (
param.block.z == 4 ||
param.block.z == 5 )
param.shared_bytes =
param.block.x *
sizeof(Float);
505 else if (
param.block.z == 0 ) {
506 param.block.x = min_threads0;
510 param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
513 else if (
param.block.z == 1 ) {
514 param.block.x = min_threads0;
518 param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
521 else if (
param.block.z == 2 ) {
522 param.block.x = min_threads1;
526 param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
529 else if (
param.block.z == 3 ) {
530 param.block.x = min_threads1;
534 param.shared_bytes =
param.block.x *
sizeof(Float);
537 else if (
param.block.z == 4 ) {
538 param.block.x = min_threads1;
542 param.shared_bytes =
param.block.x *
sizeof(Float);
549 unsigned int sharedBytesPerThread()
const {
553 if (
param.block.z == 0 )
return param.block.x * 4 *
sizeof(Float);
554 else if (
param.block.z == 1 ||
param.block.z == 2 )
return param.block.x * 4 *
sizeof(Float) / 8;
555 else if (
param.block.z == 3 )
return param.block.x * 4 *
sizeof(Float);
556 else return param.block.x *
sizeof(Float);
559 bool tuneSharedBytes()
const {
562 bool tuneGridDim()
const {
565 unsigned int minThreads()
const {
571 param.block = dim3(256, 1, 0);
573 param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
579 void setParity(
const int par){
583 void apply(
const cudaStream_t &
stream){
585 LAUNCH_KERNEL_GAUGEFIX(computeFix, tp,
stream,
arg,
parity, Float, Gauge, gauge_dir);
590 initTuneParam(
param);
594 std::stringstream vol;
595 vol <<
arg.X[0] <<
"x";
596 vol <<
arg.X[1] <<
"x";
597 vol <<
arg.X[2] <<
"x";
599 sprintf(aux_string,
"threads=%d,prec=%lu,gaugedir=%d",
arg.threads,
sizeof(Float),gauge_dir);
600 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
605 ps <<
", atomicadd=" <<
param.block.z;
616 long long flops()
const {
617 return 3LL * (22 + 28 * gauge_dir + 224 * 3) *
arg.threads;
620 long long bytes()
const {
621 return 8LL * 2 *
arg.threads * numParams *
sizeof(Float);
629 template <
typename Float,
typename Gauge>
630 struct GaugeFixInteriorPointsArg {
638 const Float relax_boost;
639 GaugeFixInteriorPointsArg(Gauge & dataOr,
cudaGaugeField & data,
const Float relax_boost)
640 : dataOr(dataOr), data(data), relax_boost(relax_boost) {
643 for (
int dir = 0; dir < 4; ++dir ) {
645 else border[dir] = 0;
647 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.
X()[dir] - border[dir] * 2;
649 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.
X()[dir];
651 threads =
X[0] *
X[1] *
X[2] *
X[3] >> 1;
661 template<
int ImplementationType,
int blockSize,
typename Float,
typename Gauge,
int gauge_dir>
662 __global__
void computeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge>
arg,
int parity){
665 if (
idx >=
arg.threads )
return;
666 typedef complex<Float>
Complex;
669 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
672 int za = (
idx / (
X[0] / 2));
673 int zb = (
za /
X[1]);
676 x[2] =
zb -
x[3] *
X[2];
677 int p = 0;
for (
int dr = 0; dr < 4; ++dr )
p +=
arg.border[dr];
679 int x1odd = (
x[1] +
x[2] +
x[3] +
parity +
p) & 1;
681 x[0] = (2 *
idx + x1odd) -
za *
X[0];
682 for (
int dr = 0; dr < 4; ++dr ) {
683 x[dr] +=
arg.border[dr];
684 X[dr] += 2 *
arg.border[dr];
692 if ( ImplementationType < 3 ) {
698 idx = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
703 if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
705 if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
708 if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
713 idx = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
719 int idx1 = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
725 if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
727 if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
730 if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
733 arg.dataOr.save((Float*)(link1.data),idx1,
mu, 1 -
parity);
748 template<
typename Float,
typename Gauge,
int gauge_dir>
749 class GaugeFixInteriorPoints :
Tunable {
750 GaugeFixInteriorPointsArg<Float, Gauge>
arg;
752 mutable char aux_string[128];
755 dim3 createGrid (
const dim3 &
block)
const {
756 unsigned int blockx =
block.x / 8;
758 unsigned int gx = (
arg.threads + blockx - 1) / blockx;
759 return dim3(gx, 1, 1);
764 const unsigned int min_threads0 = 32 * 8;
765 const unsigned int min_threads1 = 32 * 4;
766 const unsigned int max_threads = 1024;
767 const unsigned int atmadd = 0;
768 unsigned int min_threads = min_threads0;
769 param.block.z += atmadd;
770 if (
param.block.z > 2 ) min_threads = 32 * 4;
771 param.block.x += min_threads;
777 if ((
param.block.x >= min_threads) && (
param.block.x <= max_threads)) {
778 if (
param.block.z == 0 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
779 else if (
param.block.z == 1 ||
param.block.z == 2 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
780 else if (
param.block.z == 3 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
781 else if (
param.block.z == 4 ||
param.block.z == 5 )
param.shared_bytes =
param.block.x *
sizeof(Float);
784 else if (
param.block.z == 0 ) {
785 param.block.x = min_threads0;
789 param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
792 else if (
param.block.z == 1 ) {
793 param.block.x = min_threads0;
797 param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
800 else if (
param.block.z == 2 ) {
801 param.block.x = min_threads1;
805 param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
808 else if (
param.block.z == 3 ) {
809 param.block.x = min_threads1;
813 param.shared_bytes =
param.block.x *
sizeof(Float);
816 else if (
param.block.z == 4 ) {
817 param.block.x = min_threads1;
821 param.shared_bytes =
param.block.x *
sizeof(Float);
828 unsigned int sharedBytesPerThread()
const {
832 if (
param.block.z == 0 )
return param.block.x * 4 *
sizeof(Float);
833 else if (
param.block.z == 1 ||
param.block.z == 2 )
return param.block.x * 4 *
sizeof(Float) / 8;
834 else if (
param.block.z == 3 )
return param.block.x * 4 *
sizeof(Float);
835 else return param.block.x *
sizeof(Float);
838 bool tuneSharedBytes()
const {
841 bool tuneGridDim()
const {
844 unsigned int minThreads()
const {
850 param.block = dim3(256, 1, 0);
852 param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
854 GaugeFixInteriorPoints(GaugeFixInteriorPointsArg<Float, Gauge> &
arg) :
arg(
arg),
parity(0) { }
855 ~GaugeFixInteriorPoints () { }
856 void setParity(
const int par){
860 void apply(
const cudaStream_t &
stream){
862 LAUNCH_KERNEL_GAUGEFIX(computeFixInteriorPoints, tp,
stream,
arg,
parity, Float, Gauge, gauge_dir);
868 initTuneParam(
param);
872 std::stringstream vol;
873 vol <<
arg.X[0] <<
"x";
874 vol <<
arg.X[1] <<
"x";
875 vol <<
arg.X[2] <<
"x";
877 sprintf(aux_string,
"threads=%d,prec=%lu,gaugedir=%d",
arg.threads,
sizeof(Float),gauge_dir);
878 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
883 ps <<
", atomicadd=" <<
param.block.z;
894 long long flops()
const {
895 return 3LL * (22 + 28 * gauge_dir + 224 * 3) *
arg.threads;
898 long long bytes()
const {
899 return 8LL * 2 *
arg.threads * numParams *
sizeof(Float);
918 template <
typename Float,
typename Gauge>
919 struct GaugeFixBorderPointsArg {
923 int *borderpoints[2];
924 int *faceindicessize[2];
926 size_t faceVolumeCB[4];
929 const Float relax_boost;
931 GaugeFixBorderPointsArg(Gauge & dataOr,
cudaGaugeField & data,
const Float relax_boost,
size_t faceVolume_[4],
size_t faceVolumeCB_[4])
932 : dataOr(dataOr), data(data), relax_boost(relax_boost) {
935 for (
int dir = 0; dir < 4; ++dir ) {
936 X[dir] = data.
X()[dir] - data.
R()[dir] * 2;
937 border[dir] = data.
R()[dir];
945 for (
int dir = 0; dir < 4; ++dir ) {
947 faceVolumeCB[dir] = faceVolumeCB_[dir];
956 template<
int ImplementationType,
int blockSize,
typename Float,
typename Gauge,
int gauge_dir>
957 __global__
void computeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge>
arg,
int parity){
958 typedef complex<Float> Cmplx;
962 if (
idx >=
arg.threads )
return;
971 for (
int dr = 0; dr < 4; ++dr )
x[dr] +=
arg.border[dr];
973 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr] + 2 *
arg.border[dr];
976 if ( ImplementationType < 3 ) {
982 idx = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
987 if ( ImplementationType == 0 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
989 if ( ImplementationType == 1 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
992 if ( ImplementationType == 2 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link,
arg.relax_boost, tid);
997 idx = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
1003 int idx1 = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
1009 if ( ImplementationType == 3 ) GaugeFixHit_NoAtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
1011 if ( ImplementationType == 4 ) GaugeFixHit_AtomicAdd<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
1014 if ( ImplementationType == 5 ) GaugeFixHit_NoAtomicAdd_LessSM<blockSize, Float, gauge_dir, 3>(link, link1,
arg.relax_boost, tid);
1017 arg.dataOr.save((Float*)(link1.data),idx1,
mu, 1 -
parity);
1027 template<
typename Float,
typename Gauge,
int gauge_dir>
1028 class GaugeFixBorderPoints :
Tunable {
1029 GaugeFixBorderPointsArg<Float, Gauge>
arg;
1031 mutable char aux_string[128];
1034 dim3 createGrid (
const dim3 &
block)
const {
1035 unsigned int blockx =
block.x / 8;
1037 unsigned int gx = (
arg.threads + blockx - 1) / blockx;
1038 return dim3(gx, 1, 1);
1043 const unsigned int min_threads0 = 32 * 8;
1044 const unsigned int min_threads1 = 32 * 4;
1045 const unsigned int max_threads = 1024;
1046 const unsigned int atmadd = 0;
1047 unsigned int min_threads = min_threads0;
1048 param.block.z += atmadd;
1049 if (
param.block.z > 2 ) min_threads = 32 * 4;
1050 param.block.x += min_threads;
1056 if ((
param.block.x >= min_threads) && (
param.block.x <= max_threads)) {
1057 if (
param.block.z == 0 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
1058 else if (
param.block.z == 1 ||
param.block.z == 2 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
1059 else if (
param.block.z == 3 )
param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
1060 else if (
param.block.z == 4 ||
param.block.z == 5 )
param.shared_bytes =
param.block.x *
sizeof(Float);
1063 else if (
param.block.z == 0 ) {
1064 param.block.x = min_threads0;
1068 param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
1071 else if (
param.block.z == 1 ) {
1072 param.block.x = min_threads0;
1076 param.shared_bytes =
param.block.x * 4 *
sizeof(Float) / 8;
1079 else if (
param.block.z == 2 ) {
1080 param.block.x = min_threads1;
1084 param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
1087 else if (
param.block.z == 3 ) {
1088 param.block.x = min_threads1;
1092 param.shared_bytes =
param.block.x *
sizeof(Float);
1095 else if (
param.block.z == 4 ) {
1096 param.block.x = min_threads1;
1100 param.shared_bytes =
param.block.x *
sizeof(Float);
1107 unsigned int sharedBytesPerThread()
const {
1111 if (
param.block.z == 0 )
return param.block.x * 4 *
sizeof(Float);
1112 else if (
param.block.z == 1 ||
param.block.z == 2 )
return param.block.x * 4 *
sizeof(Float) / 8;
1113 else if (
param.block.z == 3 )
return param.block.x * 4 *
sizeof(Float);
1114 else return param.block.x *
sizeof(Float);
1117 bool tuneSharedBytes()
const {
1120 bool tuneGridDim()
const {
1123 unsigned int minThreads()
const {
1129 param.block = dim3(256, 1, 0);
1131 param.shared_bytes =
param.block.x * 4 *
sizeof(Float);
1133 GaugeFixBorderPoints(GaugeFixBorderPointsArg<Float, Gauge> &
arg) :
arg(
arg),
parity(0) { }
1134 ~GaugeFixBorderPoints () {
1137 void setParity(
const int par){
1141 void apply(
const cudaStream_t &
stream){
1143 LAUNCH_KERNEL_GAUGEFIX(computeFixBorderPoints, tp,
stream,
arg,
parity, Float, Gauge, gauge_dir);
1148 initTuneParam(
param);
1152 std::stringstream vol;
1153 vol <<
arg.X[0] <<
"x";
1154 vol <<
arg.X[1] <<
"x";
1155 vol <<
arg.X[2] <<
"x";
1157 sprintf(aux_string,
"threads=%d,prec=%lu,gaugedir=%d",
arg.threads,
sizeof(Float),gauge_dir);
1158 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
1163 ps <<
", atomicadd=" <<
param.block.z;
1174 long long flops()
const {
1175 return 3LL * (22 + 28 * gauge_dir + 224 * 3) *
arg.threads;
1178 long long bytes()
const {
1179 return 8LL * 2 *
arg.threads * numParams *
sizeof(Float);
1197 template <
typename Gauge>
1198 struct GaugeFixUnPackArg {
1206 for (
int dir = 0; dir < 4; ++dir ) {
1207 X[dir] = data.
X()[dir] - data.
R()[dir] * 2;
1209 border[dir] = data.
R()[dir];
1216 template<
int NElems,
typename Float,
typename Gauge,
bool pack>
1217 __global__
void Kernel_UnPackGhost(
int size, GaugeFixUnPackArg<Gauge>
arg, complex<Float> *
array,
int parity,
int face,
int dir){
1221 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
1230 x[2] =
za -
x[3] *
X[2];
1232 xodd = (borderid +
x[2] +
x[3] +
parity) & 1;
1233 x[1] = (2 *
idx + xodd) -
za *
X[1];
1238 x[2] =
za -
x[3] *
X[2];
1240 xodd = (borderid +
x[2] +
x[3] +
parity) & 1;
1241 x[0] = (2 *
idx + xodd) -
za *
X[0];
1246 x[1] =
za -
x[3] *
X[1];
1248 xodd = (borderid +
x[1] +
x[3] +
parity) & 1;
1249 x[0] = (2 *
idx + xodd) -
za *
X[0];
1254 x[1] =
za -
x[2] *
X[1];
1256 xodd = (borderid +
x[1] +
x[2] +
parity) & 1;
1257 x[0] = (2 *
idx + xodd) -
za *
X[0];
1260 for (
int dr = 0; dr < 4; ++dr ) {
1261 x[dr] +=
arg.border[dr];
1262 X[dr] += 2 *
arg.border[dr];
1266 int id = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
1267 typedef complex<Float> Cmplx;
1269 RegType
tmp[NElems];
1273 arg.dataOr.reconstruct.Pack(
tmp, data,
id);
1278 arg.dataOr.reconstruct.Unpack(data,
tmp,
id, dir, 0,
arg.dataOr.X,
arg.dataOr.R);
1286 template<
int NElems,
typename Float,
typename Gauge,
bool pack>
1287 __global__
void Kernel_UnPackTop(
int size, GaugeFixUnPackArg<Gauge>
arg, complex<Float> *
array,
int parity,
int face,
int dir){
1291 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
1294 int borderid =
arg.X[face] - 1;
1299 x[2] =
za -
x[3] *
X[2];
1301 xodd = (borderid +
x[2] +
x[3] +
parity) & 1;
1302 x[1] = (2 *
idx + xodd) -
za *
X[1];
1307 x[2] =
za -
x[3] *
X[2];
1309 xodd = (borderid +
x[2] +
x[3] +
parity) & 1;
1310 x[0] = (2 *
idx + xodd) -
za *
X[0];
1315 x[1] =
za -
x[3] *
X[1];
1317 xodd = (borderid +
x[1] +
x[3] +
parity) & 1;
1318 x[0] = (2 *
idx + xodd) -
za *
X[0];
1323 x[1] =
za -
x[2] *
X[1];
1325 xodd = (borderid +
x[1] +
x[2] +
parity) & 1;
1326 x[0] = (2 *
idx + xodd) -
za *
X[0];
1329 for (
int dr = 0; dr < 4; ++dr ) {
1330 x[dr] +=
arg.border[dr];
1331 X[dr] += 2 *
arg.border[dr];
1333 int id = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
1334 typedef complex<Float> Cmplx;
1336 RegType
tmp[NElems];
1340 arg.dataOr.reconstruct.Pack(
tmp, data,
id);
1345 arg.dataOr.reconstruct.Unpack(data,
tmp,
id, dir, 0,
arg.dataOr.X,
arg.dataOr.R);
1368 template<
typename Float,
typename Gauge,
int NElems,
int gauge_dir>
1370 const int Nsteps,
const int verbose_interval,
1371 const Float relax_boost,
const double tolerance,
1372 const int reunit_interval,
const int stopWtheta) {
1375 TimeProfile profileInternalGaugeFixOVR(
"InternalGaugeFixQudaOVR",
false);
1381 printfQuda(
"\tOverrelaxation boost parameter: %lf\n", (
double)relax_boost);
1382 printfQuda(
"\tStop criterium: %lf\n", tolerance);
1383 if ( stopWtheta )
printfQuda(
"\tStop criterium method: theta\n");
1384 else printfQuda(
"\tStop criterium method: Delta\n");
1385 printfQuda(
"\tMaximum number of iterations: %d\n", Nsteps);
1386 printfQuda(
"\tReunitarize at every %d steps\n", reunit_interval);
1387 printfQuda(
"\tPrint convergence results at every %d steps\n", verbose_interval);
1391 const double max_error = 1
e-10;
1403 GaugeFixQualityArg<Gauge> argQ(dataOr, data);
1404 GaugeFixQuality<Float,Gauge, gauge_dir> GaugeFixQuality(argQ);
1406 GaugeFixArg<Float, Gauge>
arg(dataOr, data, relax_boost);
1407 GaugeFix<Float,Gauge, gauge_dir> gaugeFix(
arg);
1418 void *hostbuffer_h[4];
1419 cudaStream_t GFStream[9];
1423 size_t faceVolumeCB[4];
1435 for (
int dir = 0; dir < 4; ++dir ) {
1436 X[dir] = data.
X()[dir] - data.
R()[dir] * 2;
1439 for (
int i = 0;
i < 4;
i++ ) {
1441 for (
int j = 0; j < 4; j++ ) {
1442 if (
i == j )
continue;
1448 for (
int d = 0;
d < 4;
d++ ) {
1450 offset[
d] = faceVolumeCB[
d] * NElems;
1456 cudaStreamCreate(&GFStream[
d]);
1457 cudaStreamCreate(&GFStream[4 +
d]);
1461 block[
d] = make_uint3(128, 1, 1);
1462 grid[
d] = make_uint3((faceVolumeCB[
d] +
block[
d].
x - 1) /
block[
d].
x, 1, 1);
1464 cudaStreamCreate(&GFStream[8]);
1465 for (
int d = 0;
d < 4;
d++ ) {
1468 recv[
d] = recv_d[
d];
1469 send[
d] = send_d[
d];
1470 recvg[
d] = recvg_d[
d];
1471 sendg[
d] = sendg_d[
d];
1473 recv[
d] = hostbuffer_h[
d];
1474 send[
d] =
static_cast<char*
>(hostbuffer_h[
d]) +
bytes[
d];
1475 recvg[
d] =
static_cast<char*
>(hostbuffer_h[
d]) + 3 *
bytes[
d];
1476 sendg[
d] =
static_cast<char*
>(hostbuffer_h[
d]) + 2 *
bytes[
d];
1484 GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);
1485 GaugeFixBorderPointsArg<Float, Gauge> argBorder(dataOr, data, relax_boost,
faceVolume, faceVolumeCB);
1486 GaugeFixBorderPoints<Float,Gauge, gauge_dir> gfixBorderPoints(argBorder);
1487 GaugeFixInteriorPointsArg<Float, Gauge> argInt(dataOr, data, relax_boost);
1488 GaugeFixInteriorPoints<Float,Gauge, gauge_dir> gfixIntPoints(argInt);
1491 GaugeFixQuality.apply(0);
1492 flop += (
double)GaugeFixQuality.flops();
1493 byte += (
double)GaugeFixQuality.bytes();
1494 double action0 = argQ.getAction();
1495 printfQuda(
"Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
1502 errorQuda(
"Error in the unitarization\n");
1508 for ( iter = 0; iter < Nsteps; iter++ ) {
1509 for (
int p = 0;
p < 2;
p++ ) {
1511 gaugeFix.setParity(
p);
1513 flop += (
double)gaugeFix.flops();
1514 byte += (
double)gaugeFix.bytes();
1517 gaugeFix.setParity(
p);
1519 flop += (
double)gaugeFix.flops();
1520 byte += (
double)gaugeFix.bytes();
1523 gfixIntPoints.setParity(
p);
1524 gfixBorderPoints.setParity(
p);
1525 gfixBorderPoints.apply(0);
1526 flop += (
double)gfixBorderPoints.flops();
1527 byte += (
double)gfixBorderPoints.bytes();
1528 flop += (
double)gfixIntPoints.flops();
1529 byte += (
double)gfixIntPoints.bytes();
1530 for (
int d = 0;
d < 4;
d++ ) {
1537 for (
int d = 0;
d < 4;
d++ ) {
1540 Kernel_UnPackTop<NElems, Float, Gauge, true><< < grid[
d],
block[
d], 0, GFStream[
d] >> > (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(send_d[
d]),
p,
d,
d);
1542 Kernel_UnPackGhost<NElems, Float, Gauge, true><< < grid[
d],
block[
d], 0, GFStream[4 +
d] >> > (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(sendg_d[
d]), 1 -
p,
d,
d);
1545 for (
int d = 0;
d < 4;
d++ ) {
1553 for (
int d = 0;
d < 4;
d++ ) {
1555 cudaMemcpyAsync(send[
d], send_d[
d],
bytes[
d], cudaMemcpyDeviceToHost, GFStream[
d]);
1557 for (
int d = 0;
d < 4;
d++ ) {
1559 cudaMemcpyAsync(sendg[
d], sendg_d[
d],
bytes[
d], cudaMemcpyDeviceToHost, GFStream[4 +
d]);
1563 gfixIntPoints.apply(GFStream[8]);
1566 for (
int d = 0;
d < 4;
d++ ) {
1573 for (
int d = 0;
d < 4;
d++ ) {
1576 cudaMemcpyAsync(recv_d[
d], recv[
d],
bytes[
d], cudaMemcpyHostToDevice, GFStream[
d]);
1578 for (
int d = 0;
d < 4;
d++ ) {
1581 cudaMemcpyAsync(recvg_d[
d], recvg[
d],
bytes[
d], cudaMemcpyHostToDevice, GFStream[4 +
d]);
1584 for (
int d = 0;
d < 4;
d++ ) {
1589 Kernel_UnPackGhost<NElems, Float, Gauge, false><< < grid[
d],
block[
d], 0, GFStream[
d] >> > (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(recv_d[
d]),
p,
d,
d);
1591 for (
int d = 0;
d < 4;
d++ ) {
1596 Kernel_UnPackTop<NElems, Float, Gauge, false><< < grid[
d],
block[
d], 0, GFStream[4 +
d] >> > (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(recvg_d[
d]), 1 -
p,
d,
d);
1598 for (
int d = 0;
d < 4;
d++ ) {
1654 if ((iter % reunit_interval) == (reunit_interval - 1)) {
1659 flop += 4588.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3];
1660 byte += 8.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3] * dataOr.Bytes();
1662 GaugeFixQuality.apply(0);
1663 flop += (
double)GaugeFixQuality.flops();
1664 byte += (
double)GaugeFixQuality.bytes();
1665 double action = argQ.getAction();
1666 double diff =
abs(action0 - action);
1667 if ((iter % verbose_interval) == (verbose_interval - 1))
1668 printfQuda(
"Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
1670 if ( argQ.getTheta() < tolerance )
break;
1673 if ( diff < tolerance )
break;
1677 if ((iter % reunit_interval) != 0 ) {
1682 flop += 4588.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3];
1683 byte += 8.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3] * dataOr.Bytes();
1685 if ((iter % verbose_interval) != 0 ) {
1686 GaugeFixQuality.apply(0);
1687 flop += (
double)GaugeFixQuality.flops();
1688 byte += (
double)GaugeFixQuality.bytes();
1689 double action = argQ.getAction();
1690 double diff =
abs(action0 - action);
1691 printfQuda(
"Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
1697 for (
int d = 0;
d < 4;
d++ ) {
1707 cudaStreamDestroy(GFStream[
d]);
1708 cudaStreamDestroy(GFStream[4 +
d]);
1714 cudaStreamDestroy(GFStream[8]);
1722 double gflops = (flop * 1
e-9) / (secs);
1723 double gbytes = byte / (secs * 1e9);
1727 printfQuda(
"Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
1732 template<
typename Float,
int NElems,
typename Gauge>
1734 const Float relax_boost,
const double tolerance,
const int reunit_interval,
const int stopWtheta) {
1735 if ( gauge_dir != 3 ) {
1736 printfQuda(
"Starting Landau gauge fixing...\n");
1737 gaugefixingOVR<Float, Gauge, NElems, 4>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1740 printfQuda(
"Starting Coulomb gauge fixing...\n");
1741 gaugefixingOVR<Float, Gauge, NElems, 3>(dataOr, data, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1747 template<
typename Float>
1749 const Float relax_boost,
const double tolerance,
const int reunit_interval,
const int stopWtheta) {
1757 gaugefixingOVR<Float, 18>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1762 gaugefixingOVR<Float, 12>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1767 gaugefixingOVR<Float, 8>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1776 #endif // GPU_GAUGE_ALG 1791 const double tolerance,
const int reunit_interval,
const int stopWtheta) {
1792 #ifdef GPU_GAUGE_ALG 1794 errorQuda(
"Half precision not supported\n");
1797 gaugefixingOVR<float> (data, gauge_dir, Nsteps, verbose_interval, (
float)relax_boost, tolerance, reunit_interval, stopWtheta);
1799 gaugefixingOVR<double>(data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
1804 errorQuda(
"Gauge fixing has not been built");
1805 #endif // GPU_GAUGE_ALG #define qudaMemcpy(dst, src, count, kind)
int commDimPartitioned(int dir)
#define pinned_malloc(size)
__device__ __host__ void setZero(Matrix< T, N > *m)
static __device__ __host__ int linkIndex(const int x[], const I X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
std::complex< double > Complex
static bool reunit_svd_only
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
cudaColorSpinorField * tmp
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
void comm_allreduce_array(double *data, size_t size)
virtual std::string paramString(const TuneParam &param) const
static double svd_rel_error
void gaugefixingOVR(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta)
Gauge fixing with overrelaxation with support for single and multi GPU.
void exit(int) __attribute__((noreturn))
void comm_free(MsgHandle *mh)
static unsigned int delta
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This does routine will populate the border / halo region of a gauge field that has been created using...
for(int s=0;s< param.dc.Ls;s++)
static __inline__ size_t p
void comm_start(MsgHandle *mh)
#define pool_device_malloc(size)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static double unitarize_eps
Main header file for host and device accessors to GaugeFields.
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
static bool reunit_allow_svd
static double svd_abs_error
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define device_malloc(size)
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType abs(ValueType x)
#define pool_device_free(ptr)
struct cudaExtent unsigned int cudaArray_t array
void comm_wait(MsgHandle *mh)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
QudaPrecision Precision() const
int comm_dim_partitioned(int dim)
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)