1 #ifndef _GAUGE_FIX_OVR_HIT_DEVF_H 2 #define _GAUGE_FIX_OVR_HIT_DEVF_H 16 __device__
inline operator T*()
18 extern __shared__
int __smem[];
22 __device__
inline operator const T*()
const 24 extern __shared__
int __smem[];
38 if (
block == 0 ) {
p = 0; q = 1; }
39 else if (
block == 1 ) {
p = 1; q = 2; }
47 while ( del_i < (NCOLORS - 1) && found == 0 ) {
49 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
68 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
75 if ( threadIdx.x <
blockSize * 4 ) elems[threadIdx.x] = 0.0;
84 IndexBlock<NCOLORS>(
block,
p, q);
86 if ( threadIdx.x <
blockSize * 4 ) asq = -1.0;
93 atomicAdd(elems + tid, (link(
p,
p)).
x + (link(q,q)).
x);
95 atomicAdd(elems + tid +
blockSize, (link(
p,q).
y + link(q,
p).
y) * asq);
97 atomicAdd(elems + tid +
blockSize * 2, (link(
p,q).
x - link(q,
p).
x) * asq);
99 atomicAdd(elems + tid +
blockSize * 3, (link(
p,
p).
y - link(q,q).
y) * asq);
107 Float a0sq = elems[threadIdx.x] * elems[threadIdx.x];
108 Float
x = (relax_boost * a0sq + asq) / (a0sq + asq);
109 Float r = rsqrt((a0sq +
x *
x * asq));
110 elems[threadIdx.x] *= r;
123 for (
int j = 0; j < NCOLORS; j++ ) {
125 link(
p,j) = complex<Float>( elems[tid], elems[tid +
blockSize * 3] ) * m0 + complex<Float>( elems[tid +
blockSize * 2], elems[tid +
blockSize] ) * link(q,j);
126 link(q,j) = complex<Float>(-elems[tid +
blockSize * 2], elems[tid +
blockSize]) * m0 + complex<Float>( elems[tid],-elems[tid +
blockSize * 3] ) * link(q,j);
135 for (
int j = 0; j < NCOLORS; j++ ) {
137 link(j,
p) = complex<Float>( elems[tid], -elems[tid +
blockSize * 3] ) * m0 + complex<Float>( elems[tid +
blockSize * 2], -elems[tid +
blockSize] ) * link(j,q);
138 link(j,q) = complex<Float>(-elems[tid +
blockSize * 2], -elems[tid +
blockSize]) * m0 + complex<Float>( elems[tid],elems[tid +
blockSize * 3] ) * link(j,q);
142 if (
block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) {
145 if ( threadIdx.x <
blockSize * 4 ) elems[threadIdx.x] = 0.0;
158 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
168 for (
int block = 0;
block < (NCOLORS * (NCOLORS - 1) / 2);
block++ ) {
171 IndexBlock<NCOLORS>(
block,
p, q);
180 if ( threadIdx.x <
blockSize * gauge_dir ) {
181 elems[threadIdx.x] = link(
p,
p).x + link(q,q).x;
182 elems[threadIdx.x +
blockSize * 8] = -(link(
p,q).y + link(q,
p).y);
183 elems[threadIdx.x +
blockSize * 8 * 2] = -(link(
p,q).x - link(q,
p).x);
184 elems[threadIdx.x +
blockSize * 8 * 3] = -(link(
p,
p).y - link(q,q).y);
187 elems[threadIdx.x] = link(
p,
p).x + link(q,q).x;
188 elems[threadIdx.x +
blockSize * 8] = (link(
p,q).y + link(q,
p).y);
189 elems[threadIdx.x +
blockSize * 8 * 2] = (link(
p,q).x - link(q,
p).x);
190 elems[threadIdx.x +
blockSize * 8 * 3] = (link(
p,
p).y - link(q,q).y);
195 Float a0,
a1,
a2, a3;
196 a0 = 0.0;
a1 = 0.0;
a2 = 0.0; a3 = 0.0;
198 for (
int i = 0;
i < gauge_dir;
i++ ) {
205 Float asq =
a1 *
a1 +
a2 *
a2 + a3 * a3;
206 Float a0sq = a0 * a0;
207 Float
x = (relax_boost * a0sq + asq) / (a0sq + asq);
208 Float r = rsqrt((a0sq +
x *
x * asq));
209 elems[threadIdx.x] = a0 * r;
212 elems[threadIdx.x +
blockSize * 3] = a3 *
x * r;
222 for (
int j = 0; j < NCOLORS; j++ ) {
224 link(
p,j) = complex<Float>( elems[tid], elems[tid +
blockSize * 3] ) * m0 + complex<Float>( elems[tid +
blockSize * 2], elems[tid +
blockSize] ) * link(q,j);
225 link(q,j) = complex<Float>(-elems[tid +
blockSize * 2], elems[tid +
blockSize]) * m0 + complex<Float>( elems[tid],-elems[tid +
blockSize * 3] ) * link(q,j);
234 for (
int j = 0; j < NCOLORS; j++ ) {
236 link(j,
p) = complex<Float>( elems[tid], -elems[tid +
blockSize * 3] ) * m0 + complex<Float>( elems[tid +
blockSize * 2], -elems[tid +
blockSize] ) * link(j,q);
237 link(j,q) = complex<Float>(-elems[tid +
blockSize * 2], -elems[tid +
blockSize]) * m0 + complex<Float>(elems[tid],elems[tid +
blockSize * 3] ) * link(j,q);
253 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
262 for (
int block = 0;
block < (NCOLORS * (NCOLORS - 1) / 2);
block++ ) {
265 IndexBlock<NCOLORS>(
block,
p, q);
268 elems[tid] = link(
p,
p).x + link(q,q).x;
269 elems[tid +
blockSize] = -(link(
p,q).y + link(q,
p).y);
270 elems[tid +
blockSize * 2] = -(link(
p,q).x - link(q,
p).x);
271 elems[tid +
blockSize * 3] = -(link(
p,
p).y - link(q,q).y);
275 elems[tid] += link(
p,
p).x + link(q,q).x;
276 elems[tid +
blockSize] -= (link(
p,q).y + link(q,
p).y);
277 elems[tid +
blockSize * 2] -= (link(
p,q).x - link(q,
p).x);
278 elems[tid +
blockSize * 3] -= (link(
p,
p).y - link(q,q).y);
282 elems[tid] += link(
p,
p).x + link(q,q).x;
283 elems[tid +
blockSize] -= (link(
p,q).y + link(q,
p).y);
284 elems[tid +
blockSize * 2] -= (link(
p,q).x - link(q,
p).x);
285 elems[tid +
blockSize * 3] -= (link(
p,
p).y - link(q,q).y);
287 if ( gauge_dir == 4 ) {
290 elems[tid] += link(
p,
p).x + link(q,q).x;
291 elems[tid +
blockSize] -= (link(
p,q).y + link(q,
p).y);
292 elems[tid +
blockSize * 2] -= (link(
p,q).x - link(q,
p).x);
293 elems[tid +
blockSize * 3] -= (link(
p,
p).y - link(q,q).y);
298 elems[tid] += link(
p,
p).x + link(q,q).x;
299 elems[tid +
blockSize] += (link(
p,q).y + link(q,
p).y);
300 elems[tid +
blockSize * 2] += (link(
p,q).x - link(q,
p).x);
301 elems[tid +
blockSize * 3] += (link(
p,
p).y - link(q,q).y);
305 elems[tid] += link(
p,
p).x + link(q,q).x;
306 elems[tid +
blockSize] += (link(
p,q).y + link(q,
p).y);
307 elems[tid +
blockSize * 2] += (link(
p,q).x - link(q,
p).x);
308 elems[tid +
blockSize * 3] += (link(
p,
p).y - link(q,q).y);
312 elems[tid] += link(
p,
p).x + link(q,q).x;
313 elems[tid +
blockSize] += (link(
p,q).y + link(q,
p).y);
314 elems[tid +
blockSize * 2] += (link(
p,q).x - link(q,
p).x);
315 elems[tid +
blockSize * 3] += (link(
p,
p).y - link(q,q).y);
317 if ( gauge_dir == 4 ) {
320 elems[tid] += link(
p,
p).x + link(q,q).x;
321 elems[tid +
blockSize] += (link(
p,q).y + link(q,
p).y);
322 elems[tid +
blockSize * 2] += (link(
p,q).x - link(q,
p).x);
323 elems[tid +
blockSize * 3] += (link(
p,
p).y - link(q,q).y);
332 Float a0sq = elems[tid] * elems[tid];
333 Float
x = (relax_boost * a0sq + asq) / (a0sq + asq);
334 Float r = rsqrt((a0sq +
x *
x * asq));
348 for (
int j = 0; j < NCOLORS; j++ ) {
350 link(
p,j) = complex<Float>( elems[tid], elems[tid +
blockSize * 3] ) * m0 + complex<Float>( elems[tid +
blockSize * 2], elems[tid +
blockSize] ) * link(q,j);
351 link(q,j) = complex<Float>(-elems[tid +
blockSize * 2], elems[tid +
blockSize]) * m0 + complex<Float>( elems[tid],-elems[tid +
blockSize * 3] ) * link(q,j);
360 for (
int j = 0; j < NCOLORS; j++ ) {
362 link(j,
p) = complex<Float>( elems[tid], -elems[tid +
blockSize * 3] ) * m0 + complex<Float>( elems[tid +
blockSize * 2], -elems[tid +
blockSize] ) * link(j,q);
363 link(j,q) = complex<Float>(-elems[tid +
blockSize * 2], -elems[tid +
blockSize]) * m0 + complex<Float>( elems[tid],elems[tid +
blockSize * 3] ) * link(j,q);
391 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
393 const Float relax_boost,
const int tid){
399 if ( threadIdx.x <
blockSize * 4 ) elems[threadIdx.x] = 0.0;
405 for (
int block = 0;
block < (NCOLORS * (NCOLORS - 1) / 2);
block++ ) {
408 IndexBlock<NCOLORS>(
block,
p, q);
409 if ( threadIdx.x <
blockSize * gauge_dir ) {
412 atomicAdd(elems + tid, (link1(
p,
p)).
x + (link1(q,q)).
x + (link(
p,
p)).
x + (link(q,q)).
x);
414 atomicAdd(elems + tid +
blockSize, (link1(
p,q).
y + link1(q,
p).
y) - (link(
p,q).
y + link(q,
p).
y));
416 atomicAdd(elems + tid +
blockSize * 2, (link1(
p,q).
x - link1(q,
p).
x) - (link(
p,q).
x - link(q,
p).
x));
418 atomicAdd(elems + tid +
blockSize * 3, (link1(
p,
p).
y - link1(q,q).
y) - (link(
p,
p).
y - link(q,q).
y));
426 Float a0sq = elems[threadIdx.x] * elems[threadIdx.x];
427 Float
x = (relax_boost * a0sq + asq) / (a0sq + asq);
428 Float r = rsqrt((a0sq +
x *
x * asq));
429 elems[threadIdx.x] *= r;
440 for (
int j = 0; j < NCOLORS; j++ ) {
442 link(
p,j) = complex<Float>( elems[tid], elems[tid +
blockSize * 3] ) * m0 +
444 link(q,j) = complex<Float>(-elems[tid +
blockSize * 2], elems[tid +
blockSize]) * m0 +
445 complex<Float>( elems[tid],-elems[tid +
blockSize * 3] ) * link(q,j);
451 for (
int j = 0; j < NCOLORS; j++ ) {
453 link1(j,
p) = complex<Float>( elems[tid], -elems[tid +
blockSize * 3] ) * m0 +
455 link1(j,q) = complex<Float>(-elems[tid +
blockSize * 2], -elems[tid +
blockSize]) * m0 +
456 complex<Float>( elems[tid],elems[tid +
blockSize * 3] ) * link1(j,q);
458 if (
block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) {
461 if ( threadIdx.x <
blockSize * 4 ) elems[threadIdx.x] = 0.0;
485 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
487 const Float relax_boost,
const int tid){
494 for (
int block = 0;
block < (NCOLORS * (NCOLORS - 1) / 2);
block++ ) {
497 IndexBlock<NCOLORS>(
block,
p, q);
498 if ( threadIdx.x <
blockSize * gauge_dir ) {
499 elems[threadIdx.x] = link1(
p,
p).x + link1(q,q).x + link(
p,
p).x + link(q,q).x;
500 elems[threadIdx.x +
blockSize * 4] = (link1(
p,q).y + link1(q,
p).y) - (link(
p,q).y + link(q,
p).y);
501 elems[threadIdx.x +
blockSize * 4 * 2] = (link1(
p,q).x - link1(q,
p).x) - (link(
p,q).x - link(q,
p).x);
502 elems[threadIdx.x +
blockSize * 4 * 3] = (link1(
p,
p).y - link1(q,q).y) - (link(
p,
p).y - link(q,q).y);
506 Float a0,
a1,
a2, a3;
507 a0 = 0.0;
a1 = 0.0;
a2 = 0.0; a3 = 0.0;
509 for (
int i = 0;
i < gauge_dir;
i++ ) {
516 Float asq =
a1 *
a1 +
a2 *
a2 + a3 * a3;
517 Float a0sq = a0 * a0;
518 Float
x = (relax_boost * a0sq + asq) / (a0sq + asq);
519 Float r = rsqrt((a0sq +
x *
x * asq));
520 elems[threadIdx.x] = a0 * r;
523 elems[threadIdx.x +
blockSize * 3] = a3 *
x * r;
531 for (
int j = 0; j < NCOLORS; j++ ) {
533 link(
p,j) = complex<Float>( elems[tid], elems[tid +
blockSize * 3] ) * m0 +
535 link(q,j) = complex<Float>(-elems[tid +
blockSize * 2], elems[tid +
blockSize]) * m0 +
536 complex<Float>( elems[tid],-elems[tid +
blockSize * 3] ) * link(q,j);
542 for (
int j = 0; j < NCOLORS; j++ ) {
544 link1(j,
p) = complex<Float>( elems[tid], -elems[tid +
blockSize * 3] ) * m0 +
546 link1(j,q) = complex<Float>(-elems[tid +
blockSize * 2], -elems[tid +
blockSize]) * m0 +
547 complex<Float>( elems[tid],elems[tid +
blockSize * 3] ) * link1(j,q);
562 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
571 for (
int block = 0;
block < (NCOLORS * (NCOLORS - 1) / 2);
block++ ) {
574 IndexBlock<NCOLORS>(
block,
p, q);
576 elems[tid] = link1(
p,
p).x + link1(q,q).x + link(
p,
p).x + link(q,q).x;
577 elems[tid +
blockSize] = (link1(
p,q).y + link1(q,
p).y) - (link(
p,q).y + link(q,
p).y);
578 elems[tid +
blockSize * 2] = (link1(
p,q).x - link1(q,
p).x) - (link(
p,q).x - link(q,
p).x);
579 elems[tid +
blockSize * 3] = (link1(
p,
p).y - link1(q,q).y) - (link(
p,
p).y - link(q,q).y);
583 elems[tid] += link1(
p,
p).x + link1(q,q).x + link(
p,
p).x + link(q,q).x;
584 elems[tid +
blockSize] += (link1(
p,q).y + link1(q,
p).y) - (link(
p,q).y + link(q,
p).y);
585 elems[tid +
blockSize * 2] += (link1(
p,q).x - link1(q,
p).x) - (link(
p,q).x - link(q,
p).x);
586 elems[tid +
blockSize * 3] += (link1(
p,
p).y - link1(q,q).y) - (link(
p,
p).y - link(q,q).y);
590 elems[tid] += link1(
p,
p).x + link1(q,q).x + link(
p,
p).x + link(q,q).x;
591 elems[tid +
blockSize] += (link1(
p,q).y + link1(q,
p).y) - (link(
p,q).y + link(q,
p).y);
592 elems[tid +
blockSize * 2] += (link1(
p,q).x - link1(q,
p).x) - (link(
p,q).x - link(q,
p).x);
593 elems[tid +
blockSize * 3] += (link1(
p,
p).y - link1(q,q).y) - (link(
p,
p).y - link(q,q).y);
595 if ( gauge_dir == 4 ) {
598 elems[tid] += link1(
p,
p).x + link1(q,q).x + link(
p,
p).x + link(q,q).x;
599 elems[tid +
blockSize] += (link1(
p,q).y + link1(q,
p).y) - (link(
p,q).y + link(q,
p).y);
600 elems[tid +
blockSize * 2] += (link1(
p,q).x - link1(q,
p).x) - (link(
p,q).x - link(q,
p).x);
601 elems[tid +
blockSize * 3] += (link1(
p,
p).y - link1(q,q).y) - (link(
p,
p).y - link(q,q).y);
609 Float a0sq = elems[tid] * elems[tid];
610 Float
x = (relax_boost * a0sq + asq) / (a0sq + asq);
611 Float r = rsqrt((a0sq +
x *
x * asq));
623 for (
int j = 0; j < NCOLORS; j++ ) {
625 link(
p,j) = complex<Float>( elems[tid], elems[tid +
blockSize * 3] ) * m0 +
627 link(q,j) = complex<Float>(-elems[tid +
blockSize * 2], elems[tid +
blockSize]) * m0 +
628 complex<Float>( elems[tid],-elems[tid +
blockSize * 3] ) * link(q,j);
634 for (
int j = 0; j < NCOLORS; j++ ) {
636 link1(j,
p) = complex<Float>( elems[tid], -elems[tid +
blockSize * 3] ) * m0 +
638 link1(j,q) = complex<Float>(-elems[tid +
blockSize * 2], -elems[tid +
blockSize]) * m0 +
639 complex<Float>( elems[tid],elems[tid +
blockSize * 3] ) * link1(j,q);
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
char * index(const char *, int)
static __inline__ size_t p
__forceinline__ __device__ void GaugeFixHit_NoAtomicAdd(Matrix< complex< Float >, NCOLORS > &link, const Float relax_boost, const int tid)
__forceinline__ __device__ void GaugeFixHit_NoAtomicAdd_LessSM(Matrix< complex< Float >, NCOLORS > &link, const Float relax_boost, const int tid)
__forceinline__ __device__ void GaugeFixHit_AtomicAdd(Matrix< complex< Float >, NCOLORS > &link, const Float relax_boost, const int tid)