6 return [complex(x)
for x
in a]
10 if a ==
int(a):
return `
int(a)`
14 if a == 0:
return "0i" 15 elif a == -1:
return "-i" 16 elif a == 1:
return "i" 17 else:
return fltToString(a)+
"i" 21 if re == 0
and im == 0:
return "0" 22 elif re == 0:
return imToString(im)
23 elif im == 0:
return fltToString(re)
25 im_str =
"-"+imToString(-im)
if im < 0
else "+"+imToString(im)
26 return fltToString(re)+im_str
75 return [x+y
for (x,y)
in zip(g1,g2)]
78 return [x-y
for (x,y)
in zip(g1,g2)]
98 def indentline(line):
return (
" "+line
if (line.count(
"#", 0, 1) == 0)
else line)
99 return ''.join([indentline(line)+
"\n" for line
in code.splitlines()])
102 return "{\n"+
indent(code)+
"}" 106 elif x==-1:
return "-" 107 elif x==+2:
return "+2*" 108 elif x==-2:
return "-2*" 111 return `(n/4)` +
"." + [
"x",
"y",
"z",
"w"][n%4]
114 return `(n/2)` +
"." + [
"x",
"y"][n%2]
117 def in_re(s, c):
return "i"+`s`+`c`+
"_re" 118 def in_im(s, c):
return "i"+`s`+`c`+
"_im" 119 def g_re(d, m, n):
return (
"g" if (d%2==0)
else "gT")+`m`+`n`+
"_re" 120 def g_im(d, m, n):
return (
"g" if (d%2==0)
else "gT")+`m`+`n`+
"_im" 121 def out_re(s, c):
return "o"+`s`+`c`+
"_re" 122 def out_im(s, c):
return "o"+`s`+`c`+
"_im" 123 def h1_re(h, c):
return [
"a",
"b"][h]+`c`+
"_re" 124 def h1_im(h, c):
return [
"a",
"b"][h]+`c`+
"_im" 125 def h2_re(h, c):
return [
"A",
"B"][h]+`c`+
"_re" 126 def h2_im(h, c):
return [
"A",
"B"][h]+`c`+
"_im" 127 def c_re(b, sm, cm, sn, cn):
return "c"+`(sm+2*b)`+`cm`+
"_"+`(sn+2*b)`+`cn`+
"_re" 128 def c_im(b, sm, cm, sn, cn):
return "c"+`(sm+2*b)`+`cm`+
"_"+`(sn+2*b)`+`cn`+
"_im" 129 def cinv_re(b, sm, cm, sn, cn):
return "cinv"+`(sm+2*b)`+`cm`+
"_"+`(sn+2*b)`+`cn`+
"_re" 130 def cinv_im(b, sm, cm, sn, cn):
return "cinv"+`(sm+2*b)`+`cm`+
"_"+`(sn+2*b)`+`cn`+
"_im" 131 def a_re(b, s, c):
return "a"+`(s+2*b)`+`c`+
"_re" 132 def a_im(b, s, c):
return "a"+`(s+2*b)`+`c`+
"_im" 134 def acc_re(s, c):
return "acc"+`s`+`c`+
"_re" 135 def acc_im(s, c):
return "acc"+`s`+`c`+
"_im" 137 def tmp_re(s, c):
return "tmp"+`s`+`c`+
"_re" 138 def tmp_im(s, c):
return "tmp"+`s`+`c`+
"_im" 141 if z==0:
return name+`s`+`c`+
"_re" 142 else:
return name+`s`+`c`+
"_im" 146 str +=
"// input spinor\n" 147 str +=
"#ifdef SPINOR_DOUBLE\n" 148 str +=
"#define spinorFloat double\n" 150 str +=
"#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2\n" 151 str +=
"#define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2\n" 158 if dslash
and not pack:
165 str +=
"#define spinorFloat float\n" 167 str +=
"#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4\n" 168 str +=
"#define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4\n" 174 if dslash
and not pack:
180 str +=
"#endif // SPINOR_DOUBLE\n\n" 186 str =
"// gauge link\n" 187 str +=
"#ifdef GAUGE_FLOAT2\n" 203 str +=
"#endif // GAUGE_DOUBLE\n\n" 205 str +=
"// conjugated gauge link\n" 209 str +=
"#define "+
g_re(1,m,n)+
" (+"+
g_re(0,n,m)+
")\n" 210 str +=
"#define "+
g_im(1,m,n)+
" (-"+
g_im(0,n,m)+
")\n" 217 str =
"// first chiral block of clover term\n" 218 str +=
"#ifdef CLOVER_DOUBLE\n" 228 for m
in range(n+1,6):
232 str +=
"#define "+
c_im(0,sm,cm,sn,cn)+
" C"+
nthFloat2(i+1)+
"\n" 244 for m
in range(n+1,6):
248 str +=
"#define "+
c_im(0,sm,cm,sn,cn)+
" C"+
nthFloat4(i+1)+
"\n" 250 str +=
"#endif // CLOVER_DOUBLE\n\n" 258 str +=
"#define "+
c_re(0,sm,cm,sn,cn)+
" (+"+
c_re(0,sn,cn,sm,cm)+
")\n" 259 str +=
"#define "+
c_im(0,sm,cm,sn,cn)+
" (-"+
c_im(0,sn,cn,sm,cm)+
")\n" 262 str +=
"// second chiral block of clover term (reuses C0,...,C9)\n" 269 str +=
"#define "+
c_re(1,sm,cm,sn,cn)+
" "+
c_re(0,sm,cm,sn,cn)+
"\n" 270 if m != n: str +=
"#define "+
c_im(1,sm,cm,sn,cn)+
" "+
c_im(0,sm,cm,sn,cn)+
"\n" 274 str +=
"// first chiral block of inverted clover term\n" 275 str +=
"#ifdef CLOVER_DOUBLE\n" 285 for m
in range(n+1,6):
301 for m
in range(n+1,6):
307 str +=
"#endif // CLOVER_DOUBLE\n\n" 315 str +=
"#define "+
cinv_re(0,sm,cm,sn,cn)+
" (+"+
cinv_re(0,sn,cn,sm,cm)+
")\n" 316 str +=
"#define "+
cinv_im(0,sm,cm,sn,cn)+
" (-"+
cinv_im(0,sn,cn,sm,cm)+
")\n" 319 str +=
"// second chiral block of inverted clover term (reuses C0,...,C9)\n" 326 str +=
"#define "+
cinv_re(1,sm,cm,sn,cn)+
" "+
cinv_re(0,sm,cm,sn,cn)+
"\n" 327 if m != n: str +=
"#define "+
cinv_im(1,sm,cm,sn,cn)+
" "+
cinv_im(0,sm,cm,sn,cn)+
"\n" 329 if dagger
and not pack_only:
330 str +=
"#ifndef CLOVER_TWIST_INV_DSLASH\n" 334 // declare C## here and use ASSN below instead of READ 369 #endif // CLOVER_DOUBLE 371 if dagger
and not pack_only:
381 str =
"// output spinor\n" 385 if 2*i < sharedFloats
and not sharedDslash:
386 str +=
"#define "+
out_re(s,c)+
" s["+`(2*i+0)`+
"*SHARED_STRIDE]\n" 388 str +=
"VOLATILE spinorFloat "+
out_re(s,c)+
";\n" 389 if 2*i+1 < sharedFloats
and not sharedDslash:
390 str +=
"#define "+
out_im(s,c)+
" s["+`(2*i+1)`+
"*SHARED_STRIDE]\n" 392 str +=
"VOLATILE spinorFloat "+
out_im(s,c)+
";\n" 400 prolog_str = (
"#ifdef MULTI_GPU\n\n")
403 prolog_str+= (
"// *** CUDA DSLASH ***\n\n" if not dagger
else "// *** CUDA DSLASH DAGGER ***\n\n")
404 prolog_str+=
"#define DSLASH_SHARED_FLOATS_PER_THREAD "+str(sharedFloats)+
"\n\n" 406 print "Undefined prolog" 411 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 413 #else // Open64 compiler 414 #define VOLATILE volatile 419 if dslash ==
True: prolog_str+=
def_gauge()
423 if (sharedFloats > 0):
428 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 430 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 437 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 439 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 445 if sharedFloats > 0
and not sharedDslash:
448 extern __shared__ char s_data[]; 454 VOLATILE spinorFloat *s = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 455 + (threadIdx.x % SHARED_STRIDE); 462 #include "read_gauge.h" 463 #include "io_spinor.h" 464 #include "read_clover.h" 465 #include "tmc_core.h" 470 #if (DD_PREC==2) // half precision 472 #endif // half precision 479 sid = blockIdx.x*blockDim.x + threadIdx.x; 480 if (sid >= param.threads) return; 483 int dim = dimFromFaceIndex(sid, param); // sid is also modified 485 const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1); 486 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1 487 int face_idx = sid - face_num*face_volume; // index into the respective face 491 coordsFromFaceIndex<4,QUDA_4D_PC,0,1>(X, sid, coord, face_idx, face_num, param); 494 coordsFromFaceIndex<4,QUDA_4D_PC,1,1>(X, sid, coord, face_idx, face_num, param); 497 coordsFromFaceIndex<4,QUDA_4D_PC,2,1>(X, sid, coord, face_idx, face_num, param); 500 coordsFromFaceIndex<4,QUDA_4D_PC,3,1>(X, sid, coord, face_idx, face_num, param); 506 for(int dir=0; dir<4; ++dir){ 507 active = active || isActive(dim,dir,+1,coord,param.commDim,param.dc.X); 512 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid); 528 def gen(dir, pack_only=False):
529 projIdx = dir
if not dagger
else dir + (1 - 2*(dir%2))
532 return projectors[projIdx][4*i+j]
539 return (1, proj(i,1))
541 return (0, proj(i,0))
543 boundary = [
"coord[0]==(param.dc.X[0]-1)",
"coord[0]==0",
"coord[1]==(param.dc.X[1]-1)",
"coord[1]==0",
"coord[2]==(param.dc.X[2]-1)",
"coord[2]==0",
"coord[3]==(param.dc.X[3]-1)",
"coord[3]==0"]
544 interior = [
"coord[0]<(param.dc.X[0]-1)",
"coord[0]>0",
"coord[1]<(param.dc.X[1]-1)",
"coord[1]>0",
"coord[2]<(param.dc.X[2]-1)",
"coord[2]>0",
"coord[3]<(param.dc.X[3]-1)",
"coord[3]>0"]
545 offset = [
"+1",
"-1",
"+1",
"-1",
"+1",
"-1",
"+1",
"-1"];
546 dim = [
"X",
"Y",
"Z",
"T"]
549 sp_idx = [
"X+1",
"X-1",
"X+param.dc.X[0]",
"X-param.dc.X[0]",
"X+param.dc.X2X1",
"X-param.dc.X2X1",
"X+param.dc.X3X2X1",
"X-param.dc.X3X2X1"]
552 sp_idx_wrap = [
"X-(param.dc.X[0]-1)",
"X+(param.dc.X[0]-1)",
"X-param.dc.X2X1mX1",
"X+param.dc.X2X1mX1",
"X-param.dc.X3X2X1mX2X1",
"X+param.dc.X3X2X1mX2X1",
553 "X-param.dc.X4X3X2X1mX3X2X1",
"X+param.dc.X4X3X2X1mX3X2X1"]
557 cond +=
"if (isActive(dim," + `dir/2` +
"," + offset[dir] +
",coord,param.commDim,param.dc.X) && " +boundary[dir]+
" )\n" 562 projName =
"P"+`dir/2`+[
"-",
"+"][projIdx%2]
563 str +=
"// Projector "+projName+
"\n" 564 for l
in projStr.splitlines():
568 str +=
"faceIndexFromCoords<4,1>(face_idx,coord," + `dir/2` +
",param);\n" 569 str +=
"const int sp_idx = face_idx + param.ghostOffset[" + `dir/2` +
"][" + `1-dir%2` +
"];\n" 571 str +=
"#if (DD_PREC==2)\n" 572 str +=
" sp_norm_idx = face_idx + " 573 str +=
"param.ghostNormOffset[" + `dir/2` +
"][" + `1-dir%2` +
"];\n" 581 str +=
"const int ga_idx = sid;\n" 583 str +=
"const int ga_idx = param.dc.Vh+face_idx;\n" 587 row_cnt = ([0,0,0,0])
592 if re != 0
or im != 0:
594 row_cnt[0] += row_cnt[1]
595 row_cnt[2] += row_cnt[3]
598 for h
in range(0, 2):
599 for c
in range(0, 3):
600 decl_half +=
"spinorFloat "+
h1_re(h,c)+
", "+
h1_im(h,c)+
";\n";
603 load_spinor =
"// read spinor from device memory\n" 607 load_spinor +=
"#ifndef CLOVER_TWIST_INV_DSLASH\n" 608 load_spinor +=
"READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 609 load_spinor +=
"#else\n" 610 load_spinor +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 611 load_spinor +=
"#ifndef DYNAMIC_CLOVER\n" 612 load_spinor +=
"APPLY_CLOVER_TWIST_INV(c, cinv, a, i);\n" 613 load_spinor +=
"#else\n" 614 load_spinor +=
"APPLY_CLOVER_TWIST_DYN_INV(c, a, i);\n" 615 load_spinor +=
"#endif\n" 618 load_spinor +=
"READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 620 load_spinor +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 621 load_spinor +=
"#ifndef DYNAMIC_CLOVER\n" 622 load_spinor +=
"APPLY_CLOVER_TWIST_INV(c, cinv, -a, i);\n" 623 load_spinor +=
"#else\n" 624 load_spinor +=
"APPLY_CLOVER_TWIST_DYN_INV(c, -a, i);\n" 625 load_spinor +=
"#endif\n" 626 if not pack_only
and not dagger:
627 load_spinor +=
"#endif\n" 628 elif row_cnt[2] == 0:
631 load_spinor +=
"#ifndef CLOVER_TWIST_INV_DSLASH\n" 632 load_spinor +=
"READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 633 load_spinor +=
"#else\n" 634 load_spinor +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 635 load_spinor +=
"#ifndef DYNAMIC_CLOVER\n" 636 load_spinor +=
"APPLY_CLOVER_TWIST_INV(c, cinv, a, i);\n" 637 load_spinor +=
"#else\n" 638 load_spinor +=
"APPLY_CLOVER_TWIST_DYN_INV(c, a, i);\n" 639 load_spinor +=
"#endif\n" 642 load_spinor +=
"READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 644 load_spinor +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 645 load_spinor +=
"#ifndef DYNAMIC_CLOVER\n" 646 load_spinor +=
"APPLY_CLOVER_TWIST_INV(c, cinv, -a, i);\n" 647 load_spinor +=
"#else\n" 648 load_spinor +=
"APPLY_CLOVER_TWIST_DYN_INV(c, -a, i);\n" 649 load_spinor +=
"#endif\n" 650 if not pack_only
and not dagger:
651 load_spinor +=
"#endif\n" 653 load_spinor +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 654 if not dagger
and not pack_only:
655 load_spinor +=
"#ifdef CLOVER_TWIST_INV_DSLASH\n" 656 load_spinor +=
"#ifndef DYNAMIC_CLOVER\n" 657 load_spinor +=
"APPLY_CLOVER_TWIST_INV(c, cinv, a, i);\n" 658 load_spinor +=
"#else\n" 659 load_spinor +=
"APPLY_CLOVER_TWIST_DYN_INV(c, a, i);\n" 660 load_spinor +=
"#endif\n" 661 load_spinor +=
"#endif\n" 664 load_spinor +=
"#ifndef DYNAMIC_CLOVER\n" 665 load_spinor +=
"APPLY_CLOVER_TWIST_INV(c, cinv, a, i);\n" 666 load_spinor +=
"#else\n" 667 load_spinor +=
"APPLY_CLOVER_TWIST_DYN_INV(c, a, i);\n" 668 load_spinor +=
"#endif\n" 670 load_spinor +=
"#ifndef DYNAMIC_CLOVER\n" 671 load_spinor +=
"APPLY_CLOVER_TWIST_INV(c, cinv, -a, i);\n" 672 load_spinor +=
"#else\n" 673 load_spinor +=
"APPLY_CLOVER_TWIST_DYN_INV(c, -a, i);\n" 674 load_spinor +=
"#endif\n" 678 load_half +=
"const int sp_stride_pad = param.dc.ghostFace[" + `dir/2` +
"];\n" 684 load_half +=
"const int t_proj_scale = TPROJSCALE;\n" 687 load_half +=
"// read half spinor from device memory\n" 691 load_half +=
"READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, "+`dir`+
");\n\n" 694 load_gauge =
"// read gauge matrix from device memory\n" 695 load_gauge +=
"READ_GAUGE_MATRIX(G, GAUGE"+`dir%2`+
"TEX, "+`dir`+
", ga_idx, param.gauge_stride);\n\n" 697 reconstruct_gauge =
"// reconstruct gauge matrix\n" 698 reconstruct_gauge +=
"RECONSTRUCT_GAUGE_MATRIX("+`dir`+
");\n\n" 700 project =
"// project spinor into half spinors\n" 701 for h
in range(0, 2):
702 for c
in range(0, 3):
705 for s
in range(0, 4):
708 if re==0
and im==0: ()
716 for s
in range(0, 4):
717 re = proj(h+2,s).real
718 im = proj(h+2,s).imag
719 if re==0
and im==0: ()
727 project +=
h1_re(h,c)+
" = "+strRe+
";\n" 728 project +=
h1_im(h,c)+
" = "+strIm+
";\n" 731 """// store spinor into shared memory 732 WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);\n 736 """// load spinor from shared memory 737 int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1; 739 READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);\n 743 """// load spinor from shared memory 744 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x; 745 int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0; 746 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n 750 """// load spinor from shared memory 751 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x; 752 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1; 753 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n 757 """// load spinor from shared memory 758 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x; 759 int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0; 760 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n 764 """// load spinor from shared memory 765 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x; 766 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1; 767 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n 773 for h
in range(0, 2):
774 for c
in range(0, 3):
776 copy_half +=
h1_im(h,c)+
" = "+
in_im(h,c)+
";\n" 778 for h
in range(0, 2):
779 for c
in range(0, 3):
780 copy_half +=
h1_re(h,c)+
" = t_proj_scale*"+
in_re(h,c)+
"; " 781 copy_half +=
h1_im(h,c)+
" = t_proj_scale*"+
in_im(h,c)+
";\n" 786 prep_half += load_half
787 prep_half += copy_half
789 ident =
"// identity gauge matrix\n" 792 ident +=
"spinorFloat "+
h2_re(h,m)+
" = " +
h1_re(h,m) +
"; " 793 ident +=
"spinorFloat "+
h2_im(h,m)+
" = " +
h1_im(h,m) +
";\n" 798 mult +=
"// multiply row "+`m`+
"\n" 800 re =
"spinorFloat "+
h2_re(h,m)+
" = 0;\n" 801 im =
"spinorFloat "+
h2_im(h,m)+
" = 0;\n" 803 re +=
h2_re(h,m) +
" += " +
g_re(dir,m,c) +
" * "+
h1_re(h,c)+
";\n" 804 re +=
h2_re(h,m) +
" -= " +
g_im(dir,m,c) +
" * "+
h1_im(h,c)+
";\n" 805 im +=
h2_im(h,m) +
" += " +
g_re(dir,m,c) +
" * "+
h1_im(h,c)+
";\n" 806 im +=
h2_im(h,m) +
" += " +
g_im(dir,m,c) +
" * "+
h1_re(h,c)+
";\n" 818 reconstruct +=
out_re(h_out, m) +
" += " +
h2_re(h,m) +
";\n" 819 reconstruct +=
out_im(h_out, m) +
" += " +
h2_im(h,m) +
";\n" 825 if im == 0
and re == 0: ()
827 reconstruct +=
out_re(s, m) +
" " +
sign(re) +
"= " +
h2_re(h,m) +
";\n" 828 reconstruct +=
out_im(s, m) +
" " +
sign(re) +
"= " +
h2_im(h,m) +
";\n" 830 reconstruct +=
out_re(s, m) +
" " +
sign(-im) +
"= " +
h2_im(h,m) +
";\n" 831 reconstruct +=
out_im(s, m) +
" " +
sign(+im) +
"= " +
h2_re(h,m) +
";\n" 836 str +=
"if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)\n" 837 str +=
block(decl_half + prep_half + ident + reconstruct)
839 str +=
block(decl_half + prep_half + load_gauge + reconstruct_gauge + mult + reconstruct)
841 str += decl_half + prep_half + load_gauge + reconstruct_gauge + mult + reconstruct
844 out = load_spinor + decl_half + project
845 out = out.replace(
"sp_idx",
"idx")
848 return cond +
block(str)+
"\n\n" 854 if z==0:
return out_re(s,c)
857 if z==0:
return in_re(s,c)
858 else:
return in_im(s,c)
864 str +=
"#if !defined(CLOVER_TWIST_INV_DSLASH)\n" 865 str +=
"#ifdef SPINOR_DOUBLE\n" 866 str +=
"spinorFloat a = param.a;\n" 868 str +=
"spinorFloat a = param.a_f;\n" 872 str +=
"#ifdef DSLASH_XPAY\n" 874 str +=
"#ifdef SPINOR_DOUBLE\n" 875 str +=
"spinorFloat b = param.b;\n" 877 str +=
"spinorFloat b = param.b_f;\n" 880 str +=
"READ_ACCUM(ACCUMTEX, param.sp_stride)\n\n" 882 str +=
"#ifndef CLOVER_TWIST_XPAY\n" 883 str +=
"//perform invert twist first:\n" 884 str +=
"#ifndef DYNAMIC_CLOVER\n" 885 str +=
"APPLY_CLOVER_TWIST_INV(c, cinv, a, o);\n" 887 str +=
"APPLY_CLOVER_TWIST_DYN_INV(c, a, o);\n" 895 str +=
"APPLY_CLOVER_TWIST(c, a, acc);\n" 901 str +=
"#endif//CLOVER_TWIST_XPAY\n" 902 str +=
"#else //no XPAY\n" 903 str +=
"#ifndef DYNAMIC_CLOVER\n" 904 str +=
"APPLY_CLOVER_TWIST_INV(c, cinv, a, o);\n" 906 str +=
"APPLY_CLOVER_TWIST_DYN_INV(c, a, o);\n" 910 str +=
"#ifndef CLOVER_TWIST_INV_DSLASH\n" 911 str +=
"#ifndef CLOVER_TWIST_XPAY\n" 912 str +=
"//perform invert twist first:\n" 913 str +=
"#ifndef DYNAMIC_CLOVER\n" 914 str +=
"APPLY_CLOVER_TWIST_INV(c, cinv, -a, o);\n" 916 str +=
"APPLY_CLOVER_TWIST_DYN_INV(c, -a, o);\n" 919 str +=
"APPLY_CLOVER_TWIST(c, -a, acc);\n" 927 str +=
"#else //no XPAY\n" 928 str +=
"#ifndef CLOVER_TWIST_INV_DSLASH\n" 929 str +=
"#ifndef DYNAMIC_CLOVER\n" 930 str +=
"APPLY_CLOVER_TWIST_INV(c, cinv, -a, o);\n" 932 str +=
"APPLY_CLOVER_TWIST_DYN_INV(c, -a, o);\n" 944 str +=
block( block_str )
947 str +=
"// write spinor field back to device memory\n" 948 str +=
"WRITE_SPINOR(param.sp_stride);\n\n" 950 str +=
"// undefine to prevent warning when precision is changed\n" 951 str +=
"#undef spinorFloat\n" 953 str +=
"#undef WRITE_SPINOR_SHARED\n" 954 str +=
"#undef READ_SPINOR_SHARED\n" 955 if sharedFloats > 0: str +=
"#undef SHARED_STRIDE\n\n" 961 str +=
"#undef "+
g_re(0,m,n)+
"\n" 962 str +=
"#undef "+
g_im(0,m,n)+
"\n" 968 str +=
"#undef "+
in_re(s,c)+
"\n" 969 str +=
"#undef "+
in_im(s,c)+
"\n" 976 str +=
"#undef "+
acc_re(s,c)+
"\n" 977 str +=
"#undef "+
acc_im(s,c)+
"\n" 985 if 2*i < sharedFloats:
986 str +=
"#undef "+
out_re(s,c)+
"\n" 987 if 2*i+1 < sharedFloats:
988 str +=
"#undef "+
out_im(s,c)+
"\n" 994 str +=
"#undef "+
c_re(0,s,c,s,c)+
"\n" 998 for m
in range(n+1,6):
1001 str +=
"#undef "+
c_re(0,sm,cm,sn,cn)+
"\n" 1002 str +=
"#undef "+
c_im(0,sm,cm,sn,cn)+
"\n" 1005 for m
in range(0,6):
1008 str +=
"#undef "+
cinv_re(0,s,c,s,c)+
"\n" 1009 for n
in range(0,6):
1012 for m
in range(n+1,6):
1015 str +=
"#undef "+
cinv_re(0,sm,cm,sn,cn)+
"\n" 1016 str +=
"#undef "+
cinv_im(0,sm,cm,sn,cn)+
"\n" 1019 str +=
"#undef VOLATILE\n\n" 1020 str +=
"#endif // MULTI_GPU\n" 1028 str +=
"switch(dim) {\n" 1029 for dim
in range(0,4):
1030 str +=
"case "+`dim`+
":\n" 1031 proj =
gen(2*dim+facenum, pack_only=
True)
1033 proj +=
"// write half spinor back to device memory\n" 1034 proj +=
"WRITE_HALF_SPINOR(face_volume, face_idx);\n" 1041 assert (sharedFloats == 0)
1045 str +=
"#include \"io_spinor.h\"\n\n" 1046 str +=
"#include \"read_clover.h\"\n\n" 1047 str +=
"#include \"tmc_core.h\"\n\n" 1049 str +=
"if (face_num) " 1055 str +=
"// undefine to prevent warning when precision is changed\n" 1056 str +=
"#undef spinorFloat\n" 1057 str +=
"#undef SHARED_STRIDE\n\n" 1059 for s
in range(0,4):
1060 for c
in range(0,3):
1062 str +=
"#undef "+
in_re(s,c)+
"\n" 1063 str +=
"#undef "+
in_im(s,c)+
"\n" 1066 for m
in range(0,6):
1069 str +=
"#undef "+
c_re(0,s,c,s,c)+
"\n" 1070 for n
in range(0,6):
1073 for m
in range(n+1,6):
1076 str +=
"#undef "+
c_re(0,sm,cm,sn,cn)+
"\n" 1077 str +=
"#undef "+
c_im(0,sm,cm,sn,cn)+
"\n" 1080 for m
in range(0,6):
1083 str +=
"#undef "+
cinv_re(0,s,c,s,c)+
"\n" 1084 for n
in range(0,6):
1087 for m
in range(n+1,6):
1090 str +=
"#undef "+
cinv_re(0,sm,cm,sn,cn)+
"\n" 1091 str +=
"#undef "+
cinv_im(0,sm,cm,sn,cn)+
"\n" 1103 print "Generating dslash kernel for sm" + str(arch/10)
1119 sharedDslash =
False 1123 sharedDslash =
False 1126 print "Shared floats set to " + str(sharedFloats)
1131 filename =
'dslash_core/tmc_fused_exterior_dslash_' + name +
'_core.h' 1132 print sys.argv[0] +
": generating " + filename;
1133 f = open(filename,
'w')
1138 filename =
'dslash_core/tmc_fused_exterior_dslash_dagger_' + name +
'_core.h' 1139 print sys.argv[0] +
": generating " + filename +
"\n";
1140 f = open(filename,
'w')
1153 sharedDslash =
False
def clover_twisted_xpay()
def generate_dslash_kernels(arch)
def input_spinor(s, c, z)
def spinor(name, s, c, z)
def gen(dir, pack_only=False)
def cinv_re(b, sm, cm, sn, cn)
def c_im(b, sm, cm, sn, cn)
def cinv_im(b, sm, cm, sn, cn)
def indent(code)
code generation ######################################################################## ...
def def_clover(pack_only=False)
def complexify(a)
complex numbers ######################################################################## ...
def c_re(b, sm, cm, sn, cn)