1 #ifndef _GAUGE_FIX_OVR_HIT_DEVF_H 2 #define _GAUGE_FIX_OVR_HIT_DEVF_H 16 __device__
inline operator T*()
18 extern __shared__
int __smem[];
22 __device__
inline operator const T*()
const 24 extern __shared__
int __smem[];
36 static __host__ __device__
inline void IndexBlock(
int block,
int &p,
int &q){
38 if ( block == 0 ) { p = 0; q = 1; }
39 else if ( block == 1 ) { p = 1; q = 2; }
47 while ( del_i < (NCOLORS - 1) && found == 0 ) {
49 for ( i1 = 0; i1 < (NCOLORS - del_i); i1++ ) {
51 if ( index == block ) {
68 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
75 if ( threadIdx.x < blockSize * 4 ) elems[threadIdx.x] = 0.0;
81 for (
int block = 0; block < (NCOLORS * (NCOLORS - 1) / 2); block++ ) {
84 IndexBlock<NCOLORS>(block, p, q);
86 if ( threadIdx.x < blockSize * 4 ) asq = -1.0;
90 if ( threadIdx.x < blockSize * gauge_dir || (threadIdx.x >= blockSize * 4 && threadIdx.x < blockSize * (gauge_dir + 4))) {
93 atomicAdd(elems + tid, (link(p,p)).x + (link(q,q)).x);
95 atomicAdd(elems + tid + blockSize, (link(p,q).y + link(q,p).y) * asq);
97 atomicAdd(elems + tid + blockSize * 2, (link(p,q).x - link(q,p).x) * asq);
99 atomicAdd(elems + tid + blockSize * 3, (link(p,p).y - link(q,q).y) * asq);
102 if ( threadIdx.x < blockSize ) {
104 asq = elems[threadIdx.x + blockSize] * elems[threadIdx.x + blockSize];
105 asq += elems[threadIdx.x + blockSize * 2] * elems[threadIdx.x + blockSize * 2];
106 asq += elems[threadIdx.x + blockSize * 3] * elems[threadIdx.x + blockSize * 3];
107 Float a0sq = elems[threadIdx.x] * elems[threadIdx.x];
108 Float x = (relax_boost * a0sq + asq) / (a0sq + asq);
109 Float r = rsqrt((a0sq + x * x * asq));
110 elems[threadIdx.x] *= r;
111 elems[threadIdx.x + blockSize] *= x * r;
112 elems[threadIdx.x + blockSize * 2] *= x * r;
113 elems[threadIdx.x + blockSize * 3] *= x * r;
117 if ( threadIdx.x < blockSize * 4 ) {
123 for (
int j = 0; j < NCOLORS; j++ ) {
125 link(p,j) = complex<Float>( elems[tid], elems[tid + blockSize * 3] ) * m0 + complex<Float>( elems[tid + blockSize * 2], elems[tid + blockSize] ) * link(q,j);
126 link(q,j) = complex<Float>(-elems[tid + blockSize * 2], elems[tid + blockSize]) * m0 + complex<Float>( elems[tid],-elems[tid + blockSize * 3] ) * link(q,j);
135 for (
int j = 0; j < NCOLORS; j++ ) {
137 link(j,p) = complex<Float>( elems[tid], -elems[tid + blockSize * 3] ) * m0 + complex<Float>( elems[tid + blockSize * 2], -elems[tid + blockSize] ) * link(j,q);
138 link(j,q) = complex<Float>(-elems[tid + blockSize * 2], -elems[tid + blockSize]) * m0 + complex<Float>( elems[tid],elems[tid + blockSize * 3] ) * link(j,q);
142 if ( block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) {
145 if ( threadIdx.x < blockSize * 4 ) elems[threadIdx.x] = 0.0;
158 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
168 for (
int block = 0; block < (NCOLORS * (NCOLORS - 1) / 2); block++ ) {
171 IndexBlock<NCOLORS>(block, p, q);
180 if ( threadIdx.x < blockSize * gauge_dir ) {
181 elems[threadIdx.x] = link(p,p).x + link(q,q).x;
182 elems[threadIdx.x + blockSize * 8] = -(link(p,q).y + link(q,p).y);
183 elems[threadIdx.x + blockSize * 8 * 2] = -(link(p,q).x - link(q,p).x);
184 elems[threadIdx.x + blockSize * 8 * 3] = -(link(p,p).y - link(q,q).y);
186 if ((threadIdx.x >= blockSize * 4 && threadIdx.x < blockSize * (gauge_dir + 4))) {
187 elems[threadIdx.x] = link(p,p).x + link(q,q).x;
188 elems[threadIdx.x + blockSize * 8] = (link(p,q).y + link(q,p).y);
189 elems[threadIdx.x + blockSize * 8 * 2] = (link(p,q).x - link(q,p).x);
190 elems[threadIdx.x + blockSize * 8 * 3] = (link(p,p).y - link(q,q).y);
194 if ( threadIdx.x < blockSize ) {
195 Float a0, a1, a2, a3;
196 a0 = 0.0; a1 = 0.0; a2 = 0.0; a3 = 0.0;
198 for (
int i = 0; i < gauge_dir; i++ ) {
199 a0 += elems[tid + i * blockSize] + elems[tid + (i + 4) * blockSize];
200 a1 += elems[tid + i * blockSize + blockSize * 8] + elems[tid + (i + 4) * blockSize + blockSize * 8];
201 a2 += elems[tid + i * blockSize + blockSize * 8 * 2] + elems[tid + (i + 4) * blockSize + blockSize * 8 * 2];
202 a3 += elems[tid + i * blockSize + blockSize * 8 * 3] + elems[tid + (i + 4) * blockSize + blockSize * 8 * 3];
205 Float asq = a1 * a1 + a2 * a2 + a3 * a3;
206 Float a0sq = a0 * a0;
207 Float x = (relax_boost * a0sq + asq) / (a0sq + asq);
208 Float r = rsqrt((a0sq + x * x * asq));
209 elems[threadIdx.x] = a0 * r;
210 elems[threadIdx.x + blockSize] = a1 * x * r;
211 elems[threadIdx.x + blockSize * 2] = a2 * x * r;
212 elems[threadIdx.x + blockSize * 3] = a3 * x * r;
216 if ( threadIdx.x < blockSize * 4 ) {
222 for (
int j = 0; j < NCOLORS; j++ ) {
224 link(p,j) = complex<Float>( elems[tid], elems[tid + blockSize * 3] ) * m0 + complex<Float>( elems[tid + blockSize * 2], elems[tid + blockSize] ) * link(q,j);
225 link(q,j) = complex<Float>(-elems[tid + blockSize * 2], elems[tid + blockSize]) * m0 + complex<Float>( elems[tid],-elems[tid + blockSize * 3] ) * link(q,j);
234 for (
int j = 0; j < NCOLORS; j++ ) {
236 link(j,p) = complex<Float>( elems[tid], -elems[tid + blockSize * 3] ) * m0 + complex<Float>( elems[tid + blockSize * 2], -elems[tid + blockSize] ) * link(j,q);
237 link(j,q) = complex<Float>(-elems[tid + blockSize * 2], -elems[tid + blockSize]) * m0 + complex<Float>(elems[tid],elems[tid + blockSize * 3] ) * link(j,q);
241 if ( block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) { __syncthreads(); }
253 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
262 for (
int block = 0; block < (NCOLORS * (NCOLORS - 1) / 2); block++ ) {
265 IndexBlock<NCOLORS>(block, p, q);
267 if ( threadIdx.x < blockSize ) {
268 elems[tid] = link(p,p).x + link(q,q).x;
269 elems[tid + blockSize] = -(link(p,q).y + link(q,p).y);
270 elems[tid + blockSize * 2] = -(link(p,q).x - link(q,p).x);
271 elems[tid + blockSize * 3] = -(link(p,p).y - link(q,q).y);
274 if ( threadIdx.x < blockSize * 2 && threadIdx.x >= blockSize ) {
275 elems[tid] += link(p,p).x + link(q,q).x;
276 elems[tid + blockSize] -= (link(p,q).y + link(q,p).y);
277 elems[tid + blockSize * 2] -= (link(p,q).x - link(q,p).x);
278 elems[tid + blockSize * 3] -= (link(p,p).y - link(q,q).y);
281 if ( threadIdx.x < blockSize * 3 && threadIdx.x >= blockSize * 2 ) {
282 elems[tid] += link(p,p).x + link(q,q).x;
283 elems[tid + blockSize] -= (link(p,q).y + link(q,p).y);
284 elems[tid + blockSize * 2] -= (link(p,q).x - link(q,p).x);
285 elems[tid + blockSize * 3] -= (link(p,p).y - link(q,q).y);
287 if ( gauge_dir == 4 ) {
289 if ( threadIdx.x < blockSize * 4 && threadIdx.x >= blockSize * 3 ) {
290 elems[tid] += link(p,p).x + link(q,q).x;
291 elems[tid + blockSize] -= (link(p,q).y + link(q,p).y);
292 elems[tid + blockSize * 2] -= (link(p,q).x - link(q,p).x);
293 elems[tid + blockSize * 3] -= (link(p,p).y - link(q,q).y);
297 if ( threadIdx.x < blockSize * 5 && threadIdx.x >= blockSize * 4 ) {
298 elems[tid] += link(p,p).x + link(q,q).x;
299 elems[tid + blockSize] += (link(p,q).y + link(q,p).y);
300 elems[tid + blockSize * 2] += (link(p,q).x - link(q,p).x);
301 elems[tid + blockSize * 3] += (link(p,p).y - link(q,q).y);
304 if ( threadIdx.x < blockSize * 6 && threadIdx.x >= blockSize * 5 ) {
305 elems[tid] += link(p,p).x + link(q,q).x;
306 elems[tid + blockSize] += (link(p,q).y + link(q,p).y);
307 elems[tid + blockSize * 2] += (link(p,q).x - link(q,p).x);
308 elems[tid + blockSize * 3] += (link(p,p).y - link(q,q).y);
311 if ( threadIdx.x < blockSize * 7 && threadIdx.x >= blockSize * 6 ) {
312 elems[tid] += link(p,p).x + link(q,q).x;
313 elems[tid + blockSize] += (link(p,q).y + link(q,p).y);
314 elems[tid + blockSize * 2] += (link(p,q).x - link(q,p).x);
315 elems[tid + blockSize * 3] += (link(p,p).y - link(q,q).y);
317 if ( gauge_dir == 4 ) {
319 if ( threadIdx.x < blockSize * 8 && threadIdx.x >= blockSize * 7 ) {
320 elems[tid] += link(p,p).x + link(q,q).x;
321 elems[tid + blockSize] += (link(p,q).y + link(q,p).y);
322 elems[tid + blockSize * 2] += (link(p,q).x - link(q,p).x);
323 elems[tid + blockSize * 3] += (link(p,p).y - link(q,q).y);
328 if ( threadIdx.x < blockSize ) {
329 Float asq = elems[tid + blockSize] * elems[tid + blockSize];
330 asq += elems[tid + blockSize * 2] * elems[tid + blockSize * 2];
331 asq += elems[tid + blockSize * 3] * elems[tid + blockSize * 3];
332 Float a0sq = elems[tid] * elems[tid];
333 Float x = (relax_boost * a0sq + asq) / (a0sq + asq);
334 Float r = rsqrt((a0sq + x * x * asq));
336 elems[tid + blockSize] *= x * r;
337 elems[tid + blockSize * 2] *= x * r;
338 elems[tid + blockSize * 3] *= x * r;
342 if ( threadIdx.x < blockSize * 4 ) {
348 for (
int j = 0; j < NCOLORS; j++ ) {
350 link(p,j) = complex<Float>( elems[tid], elems[tid + blockSize * 3] ) * m0 + complex<Float>( elems[tid + blockSize * 2], elems[tid + blockSize] ) * link(q,j);
351 link(q,j) = complex<Float>(-elems[tid + blockSize * 2], elems[tid + blockSize]) * m0 + complex<Float>( elems[tid],-elems[tid + blockSize * 3] ) * link(q,j);
360 for (
int j = 0; j < NCOLORS; j++ ) {
362 link(j,p) = complex<Float>( elems[tid], -elems[tid + blockSize * 3] ) * m0 + complex<Float>( elems[tid + blockSize * 2], -elems[tid + blockSize] ) * link(j,q);
363 link(j,q) = complex<Float>(-elems[tid + blockSize * 2], -elems[tid + blockSize]) * m0 + complex<Float>( elems[tid],elems[tid + blockSize * 3] ) * link(j,q);
367 if ( block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) { __syncthreads(); }
391 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
393 const Float relax_boost,
const int tid){
399 if ( threadIdx.x < blockSize * 4 ) elems[threadIdx.x] = 0.0;
405 for (
int block = 0; block < (NCOLORS * (NCOLORS - 1) / 2); block++ ) {
408 IndexBlock<NCOLORS>(block, p, q);
409 if ( threadIdx.x < blockSize * gauge_dir ) {
412 atomicAdd(elems + tid, (link1(p,p)).x + (link1(q,q)).x + (link(p,p)).x + (link(q,q)).x);
414 atomicAdd(elems + tid + blockSize, (link1(p,q).y + link1(q,p).y) - (link(p,q).y + link(q,p).y));
416 atomicAdd(elems + tid + blockSize * 2, (link1(p,q).x - link1(q,p).x) - (link(p,q).x - link(q,p).x));
418 atomicAdd(elems + tid + blockSize * 3, (link1(p,p).y - link1(q,q).y) - (link(p,p).y - link(q,q).y));
421 if ( threadIdx.x < blockSize ) {
423 Float asq = elems[threadIdx.x + blockSize] * elems[threadIdx.x + blockSize];
424 asq += elems[threadIdx.x + blockSize * 2] * elems[threadIdx.x + blockSize * 2];
425 asq += elems[threadIdx.x + blockSize * 3] * elems[threadIdx.x + blockSize * 3];
426 Float a0sq = elems[threadIdx.x] * elems[threadIdx.x];
427 Float x = (relax_boost * a0sq + asq) / (a0sq + asq);
428 Float r = rsqrt((a0sq + x * x * asq));
429 elems[threadIdx.x] *= r;
430 elems[threadIdx.x + blockSize] *= x * r;
431 elems[threadIdx.x + blockSize * 2] *= x * r;
432 elems[threadIdx.x + blockSize * 3] *= x * r;
440 for (
int j = 0; j < NCOLORS; j++ ) {
442 link(p,j) = complex<Float>( elems[tid], elems[tid + blockSize * 3] ) * m0 +
443 complex<Float>( elems[tid + blockSize * 2], elems[tid + blockSize] ) * link(q,j);
444 link(q,j) = complex<Float>(-elems[tid + blockSize * 2], elems[tid + blockSize]) * m0 +
445 complex<Float>( elems[tid],-elems[tid + blockSize * 3] ) * link(q,j);
451 for (
int j = 0; j < NCOLORS; j++ ) {
453 link1(j,p) = complex<Float>( elems[tid], -elems[tid + blockSize * 3] ) * m0 +
454 complex<Float>( elems[tid + blockSize * 2], -elems[tid + blockSize] ) * link1(j,q);
455 link1(j,q) = complex<Float>(-elems[tid + blockSize * 2], -elems[tid + blockSize]) * m0 +
456 complex<Float>( elems[tid],elems[tid + blockSize * 3] ) * link1(j,q);
458 if ( block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) {
461 if ( threadIdx.x < blockSize * 4 ) elems[threadIdx.x] = 0.0;
485 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
487 const Float relax_boost,
const int tid){
494 for (
int block = 0; block < (NCOLORS * (NCOLORS - 1) / 2); block++ ) {
497 IndexBlock<NCOLORS>(block, p, q);
498 if ( threadIdx.x < blockSize * gauge_dir ) {
499 elems[threadIdx.x] = link1(p,p).x + link1(q,q).x + link(p,p).x + link(q,q).x;
500 elems[threadIdx.x + blockSize * 4] = (link1(p,q).y + link1(q,p).y) - (link(p,q).y + link(q,p).y);
501 elems[threadIdx.x + blockSize * 4 * 2] = (link1(p,q).x - link1(q,p).x) - (link(p,q).x - link(q,p).x);
502 elems[threadIdx.x + blockSize * 4 * 3] = (link1(p,p).y - link1(q,q).y) - (link(p,p).y - link(q,q).y);
505 if ( threadIdx.x < blockSize ) {
506 Float a0, a1, a2, a3;
507 a0 = 0.0; a1 = 0.0; a2 = 0.0; a3 = 0.0;
509 for (
int i = 0; i < gauge_dir; i++ ) {
510 a0 += elems[tid + i * blockSize];
511 a1 += elems[tid + i * blockSize + blockSize * 4];
512 a2 += elems[tid + i * blockSize + blockSize * 4 * 2];
513 a3 += elems[tid + i * blockSize + blockSize * 4 * 3];
516 Float asq = a1 * a1 + a2 * a2 + a3 * a3;
517 Float a0sq = a0 * a0;
518 Float x = (relax_boost * a0sq + asq) / (a0sq + asq);
519 Float r = rsqrt((a0sq + x * x * asq));
520 elems[threadIdx.x] = a0 * r;
521 elems[threadIdx.x + blockSize] = a1 * x * r;
522 elems[threadIdx.x + blockSize * 2] = a2 * x * r;
523 elems[threadIdx.x + blockSize * 3] = a3 * x * r;
531 for (
int j = 0; j < NCOLORS; j++ ) {
533 link(p,j) = complex<Float>( elems[tid], elems[tid + blockSize * 3] ) * m0 +
534 complex<Float>( elems[tid + blockSize * 2], elems[tid + blockSize] ) * link(q,j);
535 link(q,j) = complex<Float>(-elems[tid + blockSize * 2], elems[tid + blockSize]) * m0 +
536 complex<Float>( elems[tid],-elems[tid + blockSize * 3] ) * link(q,j);
542 for (
int j = 0; j < NCOLORS; j++ ) {
544 link1(j,p) = complex<Float>( elems[tid], -elems[tid + blockSize * 3] ) * m0 +
545 complex<Float>( elems[tid + blockSize * 2], -elems[tid + blockSize] ) * link1(j,q);
546 link1(j,q) = complex<Float>(-elems[tid + blockSize * 2], -elems[tid + blockSize]) * m0 +
547 complex<Float>( elems[tid],elems[tid + blockSize * 3] ) * link1(j,q);
549 if ( block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) { __syncthreads(); }
562 template<
int blockSize,
typename Float,
int gauge_dir,
int NCOLORS>
571 for (
int block = 0; block < (NCOLORS * (NCOLORS - 1) / 2); block++ ) {
574 IndexBlock<NCOLORS>(block, p, q);
575 if ( threadIdx.x < blockSize ) {
576 elems[tid] = link1(p,p).x + link1(q,q).x + link(p,p).x + link(q,q).x;
577 elems[tid + blockSize] = (link1(p,q).y + link1(q,p).y) - (link(p,q).y + link(q,p).y);
578 elems[tid + blockSize * 2] = (link1(p,q).x - link1(q,p).x) - (link(p,q).x - link(q,p).x);
579 elems[tid + blockSize * 3] = (link1(p,p).y - link1(q,q).y) - (link(p,p).y - link(q,q).y);
582 if ( threadIdx.x < blockSize * 2 && threadIdx.x >= blockSize ) {
583 elems[tid] += link1(p,p).x + link1(q,q).x + link(p,p).x + link(q,q).x;
584 elems[tid + blockSize] += (link1(p,q).y + link1(q,p).y) - (link(p,q).y + link(q,p).y);
585 elems[tid + blockSize * 2] += (link1(p,q).x - link1(q,p).x) - (link(p,q).x - link(q,p).x);
586 elems[tid + blockSize * 3] += (link1(p,p).y - link1(q,q).y) - (link(p,p).y - link(q,q).y);
589 if ( threadIdx.x < blockSize * 3 && threadIdx.x >= blockSize * 2 ) {
590 elems[tid] += link1(p,p).x + link1(q,q).x + link(p,p).x + link(q,q).x;
591 elems[tid + blockSize] += (link1(p,q).y + link1(q,p).y) - (link(p,q).y + link(q,p).y);
592 elems[tid + blockSize * 2] += (link1(p,q).x - link1(q,p).x) - (link(p,q).x - link(q,p).x);
593 elems[tid + blockSize * 3] += (link1(p,p).y - link1(q,q).y) - (link(p,p).y - link(q,q).y);
595 if ( gauge_dir == 4 ) {
597 if ( threadIdx.x < blockSize * 4 && threadIdx.x >= blockSize * 3 ) {
598 elems[tid] += link1(p,p).x + link1(q,q).x + link(p,p).x + link(q,q).x;
599 elems[tid + blockSize] += (link1(p,q).y + link1(q,p).y) - (link(p,q).y + link(q,p).y);
600 elems[tid + blockSize * 2] += (link1(p,q).x - link1(q,p).x) - (link(p,q).x - link(q,p).x);
601 elems[tid + blockSize * 3] += (link1(p,p).y - link1(q,q).y) - (link(p,p).y - link(q,q).y);
605 if ( threadIdx.x < blockSize ) {
606 Float asq = elems[tid + blockSize] * elems[tid + blockSize];
607 asq += elems[tid + blockSize * 2] * elems[tid + blockSize * 2];
608 asq += elems[tid + blockSize * 3] * elems[tid + blockSize * 3];
609 Float a0sq = elems[tid] * elems[tid];
610 Float x = (relax_boost * a0sq + asq) / (a0sq + asq);
611 Float r = rsqrt((a0sq + x * x * asq));
613 elems[tid + blockSize] *= x * r;
614 elems[tid + blockSize * 2] *= x * r;
615 elems[tid + blockSize * 3] *= x * r;
623 for (
int j = 0; j < NCOLORS; j++ ) {
625 link(p,j) = complex<Float>( elems[tid], elems[tid + blockSize * 3] ) * m0 +
626 complex<Float>( elems[tid + blockSize * 2], elems[tid + blockSize] ) * link(q,j);
627 link(q,j) = complex<Float>(-elems[tid + blockSize * 2], elems[tid + blockSize]) * m0 +
628 complex<Float>( elems[tid],-elems[tid + blockSize * 3] ) * link(q,j);
634 for (
int j = 0; j < NCOLORS; j++ ) {
636 link1(j,p) = complex<Float>( elems[tid], -elems[tid + blockSize * 3] ) * m0 +
637 complex<Float>( elems[tid + blockSize * 2], -elems[tid + blockSize] ) * link1(j,q);
638 link1(j,q) = complex<Float>(-elems[tid + blockSize * 2], -elems[tid + blockSize]) * m0 +
639 complex<Float>( elems[tid],elems[tid + blockSize * 3] ) * link1(j,q);
641 if ( block < (NCOLORS * (NCOLORS - 1) / 2) - 1 ) { __syncthreads(); }
static __host__ __device__ void IndexBlock(int block, int &p, int &q)
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
__forceinline__ __device__ void GaugeFixHit_NoAtomicAdd(Matrix< complex< Float >, NCOLORS > &link, const Float relax_boost, const int tid)
__forceinline__ __device__ void GaugeFixHit_NoAtomicAdd_LessSM(Matrix< complex< Float >, NCOLORS > &link, const Float relax_boost, const int tid)
__forceinline__ __device__ void GaugeFixHit_AtomicAdd(Matrix< complex< Float >, NCOLORS > &link, const Float relax_boost, const int tid)
static int index(int ndim, const int *dims, const int *x)