def complexify(a):
    """Promote every entry of the sequence *a* to a Python complex number.

    Returns a new list; the input is left untouched.
    """
    return list(map(complex, a))
10 if a ==
int(a):
return `
int(a)`
def imToString(a):
    """Render the imaginary coefficient *a* as a string, e.g. 1 -> "i", -1 -> "-i".

    Falls back to fltToString for general magnitudes.
    """
    # Special-case the unit and zero coefficients for readable output.
    special = {0: "0i", 1: "i", -1: "-i"}
    if a in special:
        return special[a]
    return fltToString(a) + "i"
"i" 21 if re == 0
and im == 0:
return "0" 22 elif re == 0:
return imToString(im)
23 elif im == 0:
return fltToString(re)
25 im_str =
"-"+imToString(-im)
if im < 0
else "+"+imToString(im)
26 return fltToString(re)+im_str
75 return [x+y
for (x,y)
in zip(g1,g2)]
78 return [x-y
for (x,y)
in zip(g1,g2)]
98 def indentline(line):
return (
" "+line
if (line.count(
"#", 0, 1) == 0)
else line)
99 return ''.join([indentline(line)+
"\n" for line
in code.splitlines()])
102 return "{\n"+
indent(code)+
"}" 106 elif x==-1:
return "-" 107 elif x==+2:
return "+2*" 108 elif x==-2:
def nthFloat4(n):
    """Name the *n*-th scalar inside an array of float4 registers.

    Component 4*k+j maps to element k, field {x,y,z,w}[j], e.g. 5 -> "1.y".
    Uses str() and floor division instead of the Python-2-only backtick
    repr and ambiguous '/', so the generator runs under Python 2 and 3
    with identical output.
    """
    return str(n // 4) + "." + ["x", "y", "z", "w"][n % 4]

def nthFloat2(n):
    """Name the *n*-th scalar inside an array of float2/double2 registers.

    Component 2*k+j maps to element k, field {x,y}[j], e.g. 3 -> "1.y".
    """
    return str(n // 2) + "." + ["x", "y"][n % 2]
# Variable-name helpers for the emitted CUDA source. Each returns the
# identifier of one real/imaginary scalar. All use str() instead of the
# Python-2-only backtick repr (a SyntaxError under Python 3; identical
# output for ints under Python 2), keeping the generator portable.

def in_re(s, c):
    """Input spinor, real part: spin *s*, color *c*."""
    return "i" + str(s) + str(c) + "_re"

def in_im(s, c):
    """Input spinor, imaginary part: spin *s*, color *c*."""
    return "i" + str(s) + str(c) + "_im"

def g_re(d, m, n):
    """Gauge link element (m,n), real part; odd *d* selects the transpose."""
    return ("g" if d % 2 == 0 else "gT") + str(m) + str(n) + "_re"

def g_im(d, m, n):
    """Gauge link element (m,n), imaginary part; odd *d* selects the transpose."""
    return ("g" if d % 2 == 0 else "gT") + str(m) + str(n) + "_im"

def out_re(s, c):
    """Output spinor, real part: spin *s*, color *c*."""
    return "o" + str(s) + str(c) + "_re"

def out_im(s, c):
    """Output spinor, imaginary part: spin *s*, color *c*."""
    return "o" + str(s) + str(c) + "_im"

def h1_re(h, c):
    """First-stage half spinor, real part: half-index *h* in {0,1} -> a/b."""
    return ["a", "b"][h] + str(c) + "_re"

def h1_im(h, c):
    """First-stage half spinor, imaginary part."""
    return ["a", "b"][h] + str(c) + "_im"

def h2_re(h, c):
    """Second-stage (gauge-multiplied) half spinor, real part -> A/B."""
    return ["A", "B"][h] + str(c) + "_re"

def h2_im(h, c):
    """Second-stage (gauge-multiplied) half spinor, imaginary part."""
    return ["A", "B"][h] + str(c) + "_im"

def c_re(b, sm, cm, sn, cn):
    """Clover-term element, real part; chiral block *b* offsets spins by 2*b."""
    return "c" + str(sm + 2 * b) + str(cm) + "_" + str(sn + 2 * b) + str(cn) + "_re"

def c_im(b, sm, cm, sn, cn):
    """Clover-term element, imaginary part; chiral block *b* offsets spins by 2*b."""
    return "c" + str(sm + 2 * b) + str(cm) + "_" + str(sn + 2 * b) + str(cn) + "_im"

def a_re(b, s, c):
    """Clover accumulator, real part; chiral block *b* offsets spin by 2*b."""
    return "a" + str(s + 2 * b) + str(c) + "_re"

def a_im(b, s, c):
    """Clover accumulator, imaginary part; chiral block *b* offsets spin by 2*b."""
    return "a" + str(s + 2 * b) + str(c) + "_im"

def acc_re(s, c):
    """xpay accumulator spinor, real part."""
    return "acc" + str(s) + str(c) + "_re"

def acc_im(s, c):
    """xpay accumulator spinor, imaginary part."""
    return "acc" + str(s) + str(c) + "_im"

def tmp_re(s, c):
    """Scratch temporary, real part."""
    return "tmp" + str(s) + str(c) + "_re"

def tmp_im(s, c):
    """Scratch temporary, imaginary part."""
    return "tmp" + str(s) + str(c) + "_im"

def spinor(name, s, c, z):
    """Generic spinor scalar: prefix *name*, spin *s*, color *c*; z==0 -> real."""
    if z == 0:
        return name + str(s) + str(c) + "_re"
    else:
        return name + str(s) + str(c) + "_im"
"// input spinor\n" 145 str +=
"#ifdef SPINOR_DOUBLE\n" 146 str +=
"#define spinorFloat double\n" 148 str +=
"#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2\n" 149 str +=
"#define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2\n" 156 if dslash
and not pack:
163 str +=
"#define spinorFloat float\n" 165 str +=
"#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4\n" 166 str +=
"#define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4\n" 172 if dslash
and not pack:
178 str +=
"#endif // SPINOR_DOUBLE\n\n" 184 str =
"// gauge link\n" 185 str +=
"#ifdef GAUGE_FLOAT2\n" 201 str +=
"#endif // GAUGE_DOUBLE\n\n" 203 str +=
"// conjugated gauge link\n" 207 str +=
"#define "+
g_re(1,m,n)+
" (+"+
g_re(0,n,m)+
")\n" 208 str +=
"#define "+
g_im(1,m,n)+
" (-"+
g_im(0,n,m)+
")\n" 216 str =
"// first chiral block of inverted clover term\n" 217 str +=
"#ifdef CLOVER_DOUBLE\n" 227 for m
in range(n+1,6):
231 str +=
"#define "+
c_im(0,sm,cm,sn,cn)+
" C"+
nthFloat2(i+1)+
"\n" 243 for m
in range(n+1,6):
247 str +=
"#define "+
c_im(0,sm,cm,sn,cn)+
" C"+
nthFloat4(i+1)+
"\n" 249 str +=
"#endif // CLOVER_DOUBLE\n\n" 257 str +=
"#define "+
c_re(0,sm,cm,sn,cn)+
" (+"+
c_re(0,sn,cn,sm,cm)+
")\n" 258 str +=
"#define "+
c_im(0,sm,cm,sn,cn)+
" (-"+
c_im(0,sn,cn,sm,cm)+
")\n" 261 str +=
"// second chiral block of inverted clover term (reuses C0,...,C9)\n" 268 str +=
"#define "+
c_re(1,sm,cm,sn,cn)+
" "+
c_re(0,sm,cm,sn,cn)+
"\n" 269 if m != n: str +=
"#define "+
c_im(1,sm,cm,sn,cn)+
" "+
c_im(0,sm,cm,sn,cn)+
"\n" 279 str =
"// output spinor\n" 283 if 2*i < sharedFloats
and not sharedDslash:
284 str +=
"#define "+
out_re(s,c)+
" s["+`(2*i+0)`+
"*SHARED_STRIDE]\n" 286 str +=
"VOLATILE spinorFloat "+
out_re(s,c)+
";\n" 287 if 2*i+1 < sharedFloats
and not sharedDslash:
288 str +=
"#define "+
out_im(s,c)+
" s["+`(2*i+1)`+
"*SHARED_STRIDE]\n" 290 str +=
"VOLATILE spinorFloat "+
out_im(s,c)+
";\n" 299 prolog_str= (
"// *** CUDA DSLASH ***\n\n" if not dagger
else "// *** CUDA DSLASH DAGGER ***\n\n")
300 prolog_str+=
"#define DSLASH_SHARED_FLOATS_PER_THREAD "+str(sharedFloats)+
"\n\n" 302 prolog_str= (
"// *** CUDA CLOVER ***\n\n")
303 prolog_str+=
"#define CLOVER_SHARED_FLOATS_PER_THREAD "+str(sharedFloats)+
"\n\n" 305 print "Undefined prolog" 310 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 312 #else // Open64 compiler 313 #define VOLATILE volatile 318 if dslash ==
True: prolog_str+=
def_gauge()
322 if (sharedFloats > 0):
327 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 329 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 336 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 338 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 344 if sharedFloats > 0
and not sharedDslash:
347 extern __shared__ char s_data[]; 353 VOLATILE spinorFloat *s = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 354 + (threadIdx.x % SHARED_STRIDE); 359 VOLATILE spinorFloat *s = (spinorFloat*)s_data + CLOVER_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 360 + (threadIdx.x % SHARED_STRIDE); 367 #include "read_gauge.h" 368 #include "read_clover.h" 369 #include "io_spinor.h" 382 if (kernel_type == INTERIOR_KERNEL) { 385 // Assume even dimensions 386 coordsFromIndex3D<EVEN_X>(X, x, sid, param.parity, param.dc.X); 388 // only need to check Y and Z dims currently since X and T set to match exactly 389 if (coord[1] >= param.dc.X[1]) return; 390 if (coord[2] >= param.dc.X[2]) return; 398 if (kernel_type == INTERIOR_KERNEL) { 401 sid = blockIdx.x*blockDim.x + threadIdx.x; 402 if (sid >= param.threads) return; 404 // Assume even dimensions 405 coordsFromIndex<4,QUDA_4D_PC,EVEN_X>(X, coord, sid, param); 419 } else { // exterior kernel 421 sid = blockIdx.x*blockDim.x + threadIdx.x; 422 if (sid >= param.threads) return; 424 const int face_volume = (param.threads >> 1); // volume of one face 425 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1 426 face_idx = sid - face_num*face_volume; // index into the respective face 428 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP) 429 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading) 430 //sp_idx = face_idx + param.ghostOffset[dim]; 432 coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param); 434 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid); 444 prolog_str+=
"#endif // MULTI_GPU\n\n\n" 449 #include "read_clover.h" 450 #include "io_spinor.h" 452 int sid = blockIdx.x*blockDim.x + threadIdx.x; 453 if (sid >= param.threads) return; 455 // read spinor from device memory 456 READ_SPINOR(SPINORTEX, param.sp_stride, sid, sid); 462 def gen(dir, pack_only=False):
463 projIdx = dir
if not dagger
else dir + (1 - 2*(dir%2))
466 return projectors[projIdx][4*i+j]
473 return (1, proj(i,1))
475 return (0, proj(i,0))
477 boundary = [
"coord[0]==(param.dc.X[0]-1)",
"coord[0]==0",
"coord[1]==(param.dc.X[1]-1)",
"coord[1]==0",
"coord[2]==(param.dc.X[2]-1)",
"coord[2]==0",
"coord[3]==(param.dc.X[3]-1)",
"coord[3]==0"]
478 interior = [
"coord[0]<(param.dc.X[0]-1)",
"coord[0]>0",
"coord[1]<(param.dc.X[1]-1)",
"coord[1]>0",
"coord[2]<(param.dc.X[2]-1)",
"coord[2]>0",
"coord[3]<(param.dc.X[3]-1)",
"coord[3]>0"]
479 dim = [
"X",
"Y",
"Z",
"T"]
482 sp_idx = [
"X+1",
"X-1",
"X+param.dc.X[0]",
"X-param.dc.X[0]",
"X+param.dc.X2X1",
"X-param.dc.X2X1",
"X+param.dc.X3X2X1",
"X-param.dc.X3X2X1"]
485 sp_idx_wrap = [
"X-(param.dc.X[0]-1)",
"X+(param.dc.X[0]-1)",
"X-param.dc.X2X1mX1",
"X+param.dc.X2X1mX1",
"X-param.dc.X3X2X1mX2X1",
"X+param.dc.X3X2X1mX2X1",
486 "X-param.dc.X4X3X2X1mX3X2X1",
"X+param.dc.X4X3X2X1mX3X2X1"]
489 cond +=
"#ifdef MULTI_GPU\n" 490 cond +=
"if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim["+`dir/2`+
"] || "+interior[dir]+
")) ||\n" 491 cond +=
" (kernel_type == EXTERIOR_KERNEL_"+dim[dir/2]+
" && "+boundary[dir]+
") )\n" 496 projName =
"P"+`dir/2`+[
"-",
"+"][projIdx%2]
497 str +=
"// Projector "+projName+
"\n" 498 for l
in projStr.splitlines():
502 str +=
"#ifdef MULTI_GPU\n" 503 str +=
"const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? ("+boundary[dir]+
" ? "+sp_idx_wrap[dir]+
" : "+sp_idx[dir]+
") >> 1 :\n" 504 str +=
" face_idx + param.ghostOffset[static_cast<int>(kernel_type)][" + `(dir+1)%2` +
"];\n" 505 str +=
"#if (DD_PREC==2) // half precision\n" 506 str +=
"const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][" + `(dir+1)%2` +
"];\n" 509 str +=
"const int sp_idx = ("+boundary[dir]+
" ? "+sp_idx_wrap[dir]+
" : "+sp_idx[dir]+
") >> 1;\n" 514 str +=
"const int ga_idx = sid;\n" 516 str +=
"#ifdef MULTI_GPU\n" 517 str +=
"const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);\n" 519 str +=
"const int ga_idx = sp_idx;\n" 524 row_cnt = ([0,0,0,0])
529 if re != 0
or im != 0:
531 row_cnt[0] += row_cnt[1]
532 row_cnt[2] += row_cnt[3]
535 for h
in range(0, 2):
536 for c
in range(0, 3):
537 decl_half +=
"spinorFloat "+
h1_re(h,c)+
", "+
h1_im(h,c)+
";\n";
540 load_spinor =
"// read spinor from device memory\n" 542 load_spinor +=
"READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 543 elif row_cnt[2] == 0:
544 load_spinor +=
"READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 546 load_spinor +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 550 load_half +=
"const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];\n" 552 if dir >= 6: load_half +=
"const int t_proj_scale = TPROJSCALE;\n" 554 load_half +=
"// read half spinor from device memory\n" 558 load_half +=
"READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, "+`dir`+
");\n\n" 559 load_gauge =
"// read gauge matrix from device memory\n" 560 load_gauge +=
"READ_GAUGE_MATRIX(G, GAUGE"+`dir%2`+
"TEX, "+`dir`+
", ga_idx, param.gauge_stride);\n\n" 562 reconstruct_gauge =
"// reconstruct gauge matrix\n" 563 reconstruct_gauge +=
"RECONSTRUCT_GAUGE_MATRIX("+`dir`+
");\n\n" 565 project =
"// project spinor into half spinors\n" 566 for h
in range(0, 2):
567 for c
in range(0, 3):
570 for s
in range(0, 4):
573 if re==0
and im==0: ()
581 for s
in range(0, 4):
582 re = proj(h+2,s).real
583 im = proj(h+2,s).imag
584 if re==0
and im==0: ()
592 project +=
h1_re(h,c)+
" = "+strRe+
";\n" 593 project +=
h1_im(h,c)+
" = "+strIm+
";\n" 596 """// store spinor into shared memory 597 WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);\n 601 """// load spinor from shared memory 602 int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1; 604 READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);\n 608 """// load spinor from shared memory 609 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x; 610 int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0; 611 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n 615 """// load spinor from shared memory 616 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x; 617 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1; 618 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n 622 """// load spinor from shared memory 623 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x; 624 int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0; 625 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n 629 """// load spinor from shared memory 630 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x; 631 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1; 632 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n 637 for h
in range(0, 2):
638 for c
in range(0, 3):
639 copy_half +=
h1_re(h,c)+
" = "+(
"t_proj_scale*" if (dir >= 6)
else "")+
in_re(h,c)+
"; " 640 copy_half +=
h1_im(h,c)+
" = "+(
"t_proj_scale*" if (dir >= 6)
else "")+
in_im(h,c)+
";\n" 644 prep_half +=
"#ifdef MULTI_GPU\n" 645 prep_half +=
"if (kernel_type == INTERIOR_KERNEL) {\n" 646 prep_half +=
"#endif\n" 651 prep_half +=
indent(load_spinor)
652 prep_half +=
indent(write_shared)
653 prep_half +=
indent(project)
655 prep_half +=
indent(load_shared_1)
656 prep_half +=
indent(project)
658 prep_half +=
indent(
"if (threadIdx.y == blockDim.y-1 && blockDim.y < param.dc.X[1] ) {\n")
659 prep_half +=
indent(load_spinor)
660 prep_half +=
indent(project)
661 prep_half +=
indent(
"} else {")
662 prep_half +=
indent(load_shared_2)
663 prep_half +=
indent(project)
666 prep_half +=
indent(
"if (threadIdx.y == 0 && blockDim.y < param.dc.X[1]) {\n")
667 prep_half +=
indent(load_spinor)
668 prep_half +=
indent(project)
669 prep_half +=
indent(
"} else {")
670 prep_half +=
indent(load_shared_3)
671 prep_half +=
indent(project)
674 prep_half +=
indent(
"if (threadIdx.z == blockDim.z-1 && blockDim.z < param.dc.X[2]) {\n")
675 prep_half +=
indent(load_spinor)
676 prep_half +=
indent(project)
677 prep_half +=
indent(
"} else {")
678 prep_half +=
indent(load_shared_4)
679 prep_half +=
indent(project)
682 prep_half +=
indent(
"if (threadIdx.z == 0 && blockDim.z < param.dc.X[2]) {\n")
683 prep_half +=
indent(load_spinor)
684 prep_half +=
indent(project)
685 prep_half +=
indent(
"} else {")
686 prep_half +=
indent(load_shared_5)
687 prep_half +=
indent(project)
690 prep_half +=
indent(load_spinor)
691 prep_half +=
indent(project)
693 prep_half +=
indent(load_spinor)
694 prep_half +=
indent(project)
697 prep_half +=
"#ifdef MULTI_GPU\n" 698 prep_half +=
"} else {\n" 700 prep_half +=
indent(load_half)
701 prep_half +=
indent(copy_half)
703 prep_half +=
"#endif // MULTI_GPU\n" 706 ident =
"// identity gauge matrix\n" 709 ident +=
"spinorFloat "+
h2_re(h,m)+
" = " +
h1_re(h,m) +
"; " 710 ident +=
"spinorFloat "+
h2_im(h,m)+
" = " +
h1_im(h,m) +
";\n" 715 mult +=
"// multiply row "+`m`+
"\n" 717 re =
"spinorFloat "+
h2_re(h,m)+
" = 0;\n" 718 im =
"spinorFloat "+
h2_im(h,m)+
" = 0;\n" 720 re +=
h2_re(h,m) +
" += " +
g_re(dir,m,c) +
" * "+
h1_re(h,c)+
";\n" 721 re +=
h2_re(h,m) +
" -= " +
g_im(dir,m,c) +
" * "+
h1_im(h,c)+
";\n" 722 im +=
h2_im(h,m) +
" += " +
g_re(dir,m,c) +
" * "+
h1_im(h,c)+
";\n" 723 im +=
h2_im(h,m) +
" += " +
g_im(dir,m,c) +
" * "+
h1_re(h,c)+
";\n" 729 reconstruct +=
"#ifdef SPINOR_DOUBLE\n" 730 reconstruct +=
"spinorFloat a = param.a;\n" 731 reconstruct +=
"#else\n" 732 reconstruct +=
"spinorFloat a = param.a_f;\n" 733 reconstruct +=
"#endif\n" 742 reconstruct +=
out_re(h_out, m) +
" += " +
h2_re(h,m) +
";\n" 743 reconstruct +=
out_im(h_out, m) +
" += " +
h2_im(h,m) +
";\n" 745 reconstruct +=
out_re(h_out, m) +
" += a*" +
h2_re(h,m) +
";\n" 746 reconstruct +=
out_im(h_out, m) +
" += a*" +
h2_im(h,m) +
";\n" 753 if im == 0
and re == 0: ()
755 reconstruct +=
out_re(s, m) +
" " +
sign(re) +
"= " +
h2_re(h,m) +
";\n" 756 reconstruct +=
out_im(s, m) +
" " +
sign(re) +
"= " +
h2_im(h,m) +
";\n" 758 reconstruct +=
out_re(s, m) +
" " +
sign(-im) +
"= " +
h2_im(h,m) +
";\n" 759 reconstruct +=
out_im(s, m) +
" " +
sign(+im) +
"= " +
h2_re(h,m) +
";\n" 761 if im == 0
and re == 0: ()
763 reconstruct +=
out_re(s, m) +
" " +
sign(re) +
"= a*" +
h2_re(h,m) +
";\n" 764 reconstruct +=
out_im(s, m) +
" " +
sign(re) +
"= a*" +
h2_im(h,m) +
";\n" 766 reconstruct +=
out_re(s, m) +
" " +
sign(-im) +
"= a*" +
h2_im(h,m) +
";\n" 767 reconstruct +=
out_im(s, m) +
" " +
sign(+im) +
"= a*" +
h2_re(h,m) +
";\n" 772 str +=
"if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)\n" 773 str +=
block(decl_half + prep_half + ident + reconstruct)
775 str +=
block(decl_half + prep_half + load_gauge + reconstruct_gauge + mult + reconstruct)
777 str += decl_half + prep_half + load_gauge + reconstruct_gauge + mult + reconstruct
780 out = load_spinor + decl_half + project
781 out = out.replace(
"sp_idx",
"idx")
784 return cond +
block(str)+
"\n\n" 790 if z==0:
return out_re(s,c)
793 if z==0:
return in_re(s,c)
794 else:
return in_im(s,c)
798 str +=
"spinorFloat "+
a_re(0,0,c)+
" = -"+
spinor(v_in,1,c,0)+
" - "+
spinor(v_in,3,c,0)+
";\n" 799 str +=
"spinorFloat "+
a_im(0,0,c)+
" = -"+
spinor(v_in,1,c,1)+
" - "+
spinor(v_in,3,c,1)+
";\n" 800 str +=
"spinorFloat "+
a_re(0,1,c)+
" = "+
spinor(v_in,0,c,0)+
" + "+
spinor(v_in,2,c,0)+
";\n" 801 str +=
"spinorFloat "+
a_im(0,1,c)+
" = "+
spinor(v_in,0,c,1)+
" + "+
spinor(v_in,2,c,1)+
";\n" 802 str +=
"spinorFloat "+
a_re(0,2,c)+
" = -"+
spinor(v_in,1,c,0)+
" + "+
spinor(v_in,3,c,0)+
";\n" 803 str +=
"spinorFloat "+
a_im(0,2,c)+
" = -"+
spinor(v_in,1,c,1)+
" + "+
spinor(v_in,3,c,1)+
";\n" 804 str +=
"spinorFloat "+
a_re(0,3,c)+
" = "+
spinor(v_in,0,c,0)+
" - "+
spinor(v_in,2,c,0)+
";\n" 805 str +=
"spinorFloat "+
a_im(0,3,c)+
" = "+
spinor(v_in,0,c,1)+
" - "+
spinor(v_in,2,c,1)+
";\n" 808 for s
in range (0,4):
809 str +=
spinor(v_out,s,c,0)+
" = "+
a_re(0,s,c)+
"; " 810 str +=
spinor(v_out,s,c,1)+
" = "+
a_im(0,s,c)+
";\n" 812 return block(str)+
"\n\n" 817 str +=
"spinorFloat "+
a_re(0,0,c)+
" = "+
spinor(v_in,1,c,0)+
" + "+
spinor(v_in,3,c,0)+
";\n" 818 str +=
"spinorFloat "+
a_im(0,0,c)+
" = "+
spinor(v_in,1,c,1)+
" + "+
spinor(v_in,3,c,1)+
";\n" 819 str +=
"spinorFloat "+
a_re(0,1,c)+
" = -"+
spinor(v_in,0,c,0)+
" - "+
spinor(v_in,2,c,0)+
";\n" 820 str +=
"spinorFloat "+
a_im(0,1,c)+
" = -"+
spinor(v_in,0,c,1)+
" - "+
spinor(v_in,2,c,1)+
";\n" 821 str +=
"spinorFloat "+
a_re(0,2,c)+
" = "+
spinor(v_in,1,c,0)+
" - "+
spinor(v_in,3,c,0)+
";\n" 822 str +=
"spinorFloat "+
a_im(0,2,c)+
" = "+
spinor(v_in,1,c,1)+
" - "+
spinor(v_in,3,c,1)+
";\n" 823 str +=
"spinorFloat "+
a_re(0,3,c)+
" = -"+
spinor(v_in,0,c,0)+
" + "+
spinor(v_in,2,c,0)+
";\n" 824 str +=
"spinorFloat "+
a_im(0,3,c)+
" = -"+
spinor(v_in,0,c,1)+
" + "+
spinor(v_in,2,c,1)+
";\n" 827 for s
in range (0,4):
828 str +=
spinor(v_out,s,c,0)+
" = "+
a_re(0,s,c)+
"; " 829 str +=
spinor(v_out,s,c,1)+
" = "+
a_im(0,s,c)+
";\n" 831 return block(str)+
"\n\n" 836 str =
"READ_CLOVER(CLOVERTEX, "+`chi`+
")\n\n" 838 for s
in range (0,2):
839 for c
in range (0,3):
840 str +=
"spinorFloat "+
a_re(chi,s,c)+
" = 0; spinorFloat "+
a_im(chi,s,c)+
" = 0;\n" 843 for sm
in range (0,2):
844 for cm
in range (0,3):
845 for sn
in range (0,2):
846 for cn
in range (0,3):
847 str +=
a_re(chi,sm,cm)+
" += "+
c_re(chi,sm,cm,sn,cn)+
" * "+
spinor(v_in,2*chi+sn,cn,0)+
";\n" 848 if (sn != sm)
or (cn != cm):
849 str +=
a_re(chi,sm,cm)+
" -= "+
c_im(chi,sm,cm,sn,cn)+
" * "+
spinor(v_in,2*chi+sn,cn,1)+
";\n" 851 str +=
a_im(chi,sm,cm)+
" += "+
c_re(chi,sm,cm,sn,cn)+
" * "+
spinor(v_in,2*chi+sn,cn,1)+
";\n" 852 if (sn != sm)
or (cn != cm):
853 str +=
a_im(chi,sm,cm)+
" += "+
c_im(chi,sm,cm,sn,cn)+
" * "+
spinor(v_in,2*chi+sn,cn,0)+
";\n" 857 for s
in range (0,2):
858 for c
in range (0,3):
859 str +=
spinor(v_out,2*chi+s,c,0)+
" = "+
a_re(chi,s,c)+
"; " 860 str +=
spinor(v_out,2*chi+s,c,1)+
" = "+
a_im(chi,s,c)+
";\n" 863 return block(str)+
"\n\n" 869 if dslash: str +=
"#ifdef DSLASH_CLOVER\n\n" 870 str +=
"// change to chiral basis\n" 872 str +=
"// apply first chiral block\n" 874 str +=
"// apply second chiral block\n" 876 str +=
"// change back from chiral basis\n" 877 str +=
"// (note: required factor of 1/2 is included in clover term normalization)\n" 879 if dslash: str +=
"#endif // DSLASH_CLOVER\n\n" 886 str =
"// apply twisted mass rotation\n" 888 for h
in range(0, 4):
889 for c
in range(0, 3):
892 for s
in range(0, 4):
896 if re==0
and im==0: ()
905 re = igamma5[4*h+s].real
906 im = igamma5[4*h+s].imag
907 if re==0
and im==0: ()
915 str +=
"VOLATILE spinorFloat "+
tmp_re(h,c)+
" = " + strRe +
";\n" 916 str +=
"VOLATILE spinorFloat "+
tmp_im(h,c)+
" = " + strIm +
";\n" 926 str +=
"#ifndef DSLASH_XPAY\n" 927 str +=
"//scale by b = 1/(1 + a*a) \n" 937 str +=
"#endif // DSLASH_XPAY\n" 940 return block(str)+
"\n" 946 str +=
"#ifdef DSLASH_CLOVER_XPAY\n\n" 947 str +=
"READ_ACCUM(ACCUMTEX, param.sp_stride)\n\n" 957 str +=
"#endif // DSLASH_CLOVER_XPAY\n" 964 str +=
"#ifdef DSLASH_XPAY\n\n" 965 str +=
"READ_ACCUM(ACCUMTEX, param.sp_stride)\n\n" 967 str +=
"#ifdef SPINOR_DOUBLE\n" 968 str +=
"spinorFloat a = param.a;\n" 970 str +=
"spinorFloat a = param.a_f;\n" 983 str +=
"#endif // DSLASH_XPAY\n" 991 if dslash
and not asymClover:
993 str +=
"#ifdef MULTI_GPU\n" 995 str +=
"#if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)\n" 998 int incomplete = 0; // Have all 8 contributions been computed for this site? 1000 switch(kernel_type) { // intentional fall-through 1001 case INTERIOR_KERNEL: 1002 incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1))); 1003 case EXTERIOR_KERNEL_T: 1004 incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1))); 1005 case EXTERIOR_KERNEL_Z: 1006 incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1))); 1007 case EXTERIOR_KERNEL_Y: 1008 incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1))); 1012 str +=
"if (!incomplete)\n" 1013 str +=
"#endif // MULTI_GPU\n" 1017 if twist: block_str +=
twisted()
1021 if not asymClover: block_str +=
xpay()
1023 str +=
block( block_str )
1026 str +=
"// write spinor field back to device memory\n" 1027 str +=
"WRITE_SPINOR(param.sp_stride);\n\n" 1029 str +=
"// undefine to prevent warning when precision is changed\n" 1030 str +=
"#undef spinorFloat\n" 1032 str +=
"#undef WRITE_SPINOR_SHARED\n" 1033 str +=
"#undef READ_SPINOR_SHARED\n" 1034 if sharedFloats > 0: str +=
"#undef SHARED_STRIDE\n\n" 1037 for m
in range(0,3):
1038 for n
in range(0,3):
1040 str +=
"#undef "+
g_re(0,m,n)+
"\n" 1041 str +=
"#undef "+
g_im(0,m,n)+
"\n" 1044 for s
in range(0,4):
1045 for c
in range(0,3):
1047 str +=
"#undef "+
in_re(s,c)+
"\n" 1048 str +=
"#undef "+
in_im(s,c)+
"\n" 1052 for s
in range(0,4):
1053 for c
in range(0,3):
1055 str +=
"#undef "+
acc_re(s,c)+
"\n" 1056 str +=
"#undef "+
acc_im(s,c)+
"\n" 1060 for m
in range(0,6):
1063 str +=
"#undef "+
c_re(0,s,c,s,c)+
"\n" 1064 for n
in range(0,6):
1067 for m
in range(n+1,6):
1070 str +=
"#undef "+
c_re(0,sm,cm,sn,cn)+
"\n" 1071 str +=
"#undef "+
c_im(0,sm,cm,sn,cn)+
"\n" 1074 for s
in range(0,4):
1075 for c
in range(0,3):
1077 if 2*i < sharedFloats:
1078 str +=
"#undef "+
out_re(s,c)+
"\n" 1079 if 2*i+1 < sharedFloats:
1080 str +=
"#undef "+
out_im(s,c)+
"\n" 1083 str +=
"#undef VOLATILE\n" 1091 str +=
"switch(dim) {\n" 1092 for dim
in range(0,4):
1093 str +=
"case "+`dim`+
":\n" 1094 proj =
gen(2*dim+facenum, pack_only=
True)
1096 proj +=
"// write half spinor back to device memory\n" 1097 proj +=
"WRITE_HALF_SPINOR(face_volume, face_idx);\n" 1104 assert (sharedFloats == 0)
1107 str +=
"#include \"io_spinor.h\"\n\n" 1109 str +=
"if (face_num) " 1115 str +=
"// undefine to prevent warning when precision is changed\n" 1116 str +=
"#undef spinorFloat\n" 1117 str +=
"#undef SHARED_STRIDE\n\n" 1119 for s
in range(0,4):
1120 for c
in range(0,3):
1122 str +=
"#undef "+
in_re(s,c)+
"\n" 1123 str +=
"#undef "+
in_im(s,c)+
"\n" 1138 print "Generating dslash kernel for sm" + str(arch/10)
1155 sharedDslash =
False 1159 sharedDslash =
False 1162 print "Shared floats set to " + str(sharedFloats)
1169 filename =
'dslash_core/wilson_dslash_' + name +
'_core.h' 1170 print sys.argv[0] +
": generating " + filename;
1171 f = open(filename,
'w')
1176 filename =
'dslash_core/wilson_dslash_dagger_' + name +
'_core.h' 1177 print sys.argv[0] +
": generating " + filename;
1178 f = open(filename,
'w')
1185 filename =
'dslash_core/asym_wilson_clover_dslash_' + name +
'_core.h' 1186 print sys.argv[0] +
": generating " + filename;
1187 f = open(filename,
'w')
1192 filename =
'dslash_core/asym_wilson_clover_dslash_dagger_' + name +
'_core.h' 1193 print sys.argv[0] +
": generating " + filename;
1194 f = open(filename,
'w')
1227 sharedDslash =
False 1244 print sys.argv[0] +
": generating wilson_pack_face_core.h";
1245 f = open(
'dslash_core/wilson_pack_face_core.h',
'w')
1250 print sys.argv[0] +
": generating wilson_pack_face_dagger_core.h";
1251 f = open(
'dslash_core/wilson_pack_face_dagger_core.h',
'w')
def generate_dslash_kernels(arch)
def c_im(b, sm, cm, sn, cn)
def indent(code)
code generation ######################################################################## ...
def input_spinor(s, c, z)
def apply_clover(v_out, v_in)
def c_re(b, sm, cm, sn, cn)
def complexify(a)
complex numbers ######################################################################## ...
def from_chiral_basis(v_out, v_in, c)
def gen(dir, pack_only=False)
def to_chiral_basis(v_out, v_in, c)
def spinor(name, s, c, z)
def clover_mult(v_out, v_in, chi)