7 return [complex(x)
for x
in a]
11 if a ==
int(a):
return `
int(a)`
15 if a == 0:
return "0i" 16 elif a == -1:
return "-i" 17 elif a == 1:
return "i" 18 else:
return fltToString(a)+
"i" 22 if re == 0
and im == 0:
return "0" 23 elif re == 0:
return imToString(im)
24 elif im == 0:
return fltToString(re)
26 im_str =
"-"+imToString(-im)
if im < 0
else "+"+imToString(im)
27 return fltToString(re)+im_str
76 return [x+y
for (x,y)
in zip(g1,g2)]
79 return [x-y
for (x,y)
in zip(g1,g2)]
def indentline(line):
    """Indent *line* by two spaces unless it is a C preprocessor line.

    Lines whose first character is '#' (e.g. '#define', '#ifdef') must stay
    in column 0 in the emitted kernel source, so they are returned unchanged.
    The original test `line.count("#", 0, 1) == 0` is just "first char is
    not '#'"; `startswith` states that directly.  An empty line is indented
    in both formulations.
    """
    # NOTE(review): indent width reproduced from the (mangled) source as two
    # spaces — confirm against the original file.
    return line if line.startswith("#") else "  " + line
100 return ''.join([indentline(line)+
"\n" for line
in code.splitlines()])
103 return "{\n"+
indent(code)+
"}" 107 elif x==-1:
return "-" 108 elif x==+2:
return "+2*" 109 elif x==-2:
return "-2*" 112 return `(n/4)` +
"." + [
"x",
"y",
"z",
"w"][n%4]
115 return `(n/2)` +
"." + [
"x",
"y"][n%2]
# ---------------------------------------------------------------------------
# Name-generator helpers: build the C identifier strings used in the emitted
# dslash kernel code.  All arguments are small non-negative integers.
# The Python-2-only backtick repr syntax (`x`) of the original was replaced
# with str(x): identical output for ints, and valid in Python 3 as well.
# ---------------------------------------------------------------------------

def in_re(s, c):
    """Real part of input spinor component (spin s, color c), e.g. 'i01_re'."""
    return "i" + str(s) + str(c) + "_re"

def in_im(s, c):
    """Imaginary part of input spinor component, e.g. 'i01_im'."""
    return "i" + str(s) + str(c) + "_im"

def g_re(d, m, n):
    """Real part of gauge-link matrix entry (m, n); 'gT..' (transposed) for odd d."""
    return ("g" if (d % 2 == 0) else "gT") + str(m) + str(n) + "_re"

def g_im(d, m, n):
    """Imaginary part of gauge-link matrix entry (m, n); 'gT..' for odd d."""
    return ("g" if (d % 2 == 0) else "gT") + str(m) + str(n) + "_im"

def h1_re(h, c):
    """Real part of first-stage half spinor: 'a<c>_re' (h==0) or 'b<c>_re' (h==1)."""
    return ["a", "b"][h] + str(c) + "_re"

def h1_im(h, c):
    """Imaginary part of first-stage half spinor."""
    return ["a", "b"][h] + str(c) + "_im"

def h2_re(h, c):
    """Real part of second-stage half spinor: 'A<c>_re' (h==0) or 'B<c>_re' (h==1)."""
    return ["A", "B"][h] + str(c) + "_re"

def h2_im(h, c):
    """Imaginary part of second-stage half spinor."""
    return ["A", "B"][h] + str(c) + "_im"

def a_re(b, s, c):
    """Real part of an 'a' spinor register; spin index folded as s + 2*b."""
    return "a" + str(s + 2 * b) + str(c) + "_re"

def a_im(b, s, c):
    """Imaginary part of an 'a' spinor register; spin index folded as s + 2*b."""
    return "a" + str(s + 2 * b) + str(c) + "_im"

def tmp_re(s, c):
    """Real part of a temporary spinor register, e.g. 'tmp01_re'."""
    return "tmp" + str(s) + str(c) + "_re"

def tmp_im(s, c):
    """Imaginary part of a temporary spinor register."""
    return "tmp" + str(s) + str(c) + "_im"

def acc_re(s, c):
    """Real part of the accumulator spinor, e.g. 'acc_01_re'."""
    return "acc_" + str(s) + str(c) + "_re"

def acc_im(s, c):
    """Imaginary part of the accumulator spinor."""
    return "acc_" + str(s) + str(c) + "_im"

def acc1_re(s, c):
    """Real part of the flavor-1 accumulator spinor, e.g. 'acc1_01_re'."""
    return "acc1_" + str(s) + str(c) + "_re"

def acc1_im(s, c):
    """Imaginary part of the flavor-1 accumulator spinor."""
    return "acc1_" + str(s) + str(c) + "_im"

def acc2_re(s, c):
    """Real part of the flavor-2 accumulator spinor, e.g. 'acc2_01_re'."""
    return "acc2_" + str(s) + str(c) + "_re"

def acc2_im(s, c):
    """Imaginary part of the flavor-2 accumulator spinor."""
    return "acc2_" + str(s) + str(c) + "_im"
"// input spinor\n" 147 str +=
"#ifdef SPINOR_DOUBLE\n" 148 str +=
"#define spinorFloat double\n" 150 str +=
"#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2\n" 151 str +=
"#define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2\n" 159 str +=
"#define spinorFloat float\n" 161 str +=
"#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4\n" 162 str +=
"#define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4\n" 168 str +=
"#endif // SPINOR_DOUBLE\n\n" 174 str =
"// gauge link\n" 175 str +=
"#ifdef GAUGE_FLOAT2\n" 191 str +=
"#endif // GAUGE_DOUBLE\n\n" 193 str +=
"// conjugated gauge link\n" 197 str +=
"#define "+
g_re(1,m,n)+
" (+"+
g_re(0,n,m)+
")\n" 198 str +=
"#define "+
g_im(1,m,n)+
" (-"+
g_im(0,n,m)+
")\n" 209 str =
"// output spinor for flavor 1\n" 213 if 2*i < sharedFloatsPerFlavor
and not sharedDslash:
214 str +=
"#define "+
out1_re(s,c)+
" s["+`(2*i+0)`+
"*SHARED_STRIDE]\n" 216 str +=
"VOLATILE spinorFloat "+
out1_re(s,c)+
";\n" 217 if 2*i+1 < sharedFloatsPerFlavor
and not sharedDslash:
218 str +=
"#define "+
out1_im(s,c)+
" s["+`(2*i+1)`+
"*SHARED_STRIDE]\n" 220 str +=
"VOLATILE spinorFloat "+
out1_im(s,c)+
";\n" 222 str +=
"// output spinor for flavor 2\n" 226 if 2*i < sharedFloatsPerFlavor
and not sharedDslash:
227 str +=
"#define "+
out2_re(s,c)+
" s["+`(2*i+0)+sharedFloatsPerFlavor`+
"*SHARED_STRIDE]\n" 229 str +=
"VOLATILE spinorFloat "+
out2_re(s,c)+
";\n" 230 if 2*i+1 < sharedFloatsPerFlavor
and not sharedDslash:
231 str +=
"#define "+
out2_im(s,c)+
" s["+`(2*i+1)+sharedFloatsPerFlavor`+
"*SHARED_STRIDE]\n" 233 str +=
"VOLATILE spinorFloat "+
out2_im(s,c)+
";\n" 242 prolog_str= (
"// *** CUDA NDEG TWISTED MASS DSLASH ***\n\n" if not dagger
else "// *** CUDA NDEG TWISTED MASS DSLASH DAGGER ***\n\n")
243 prolog_str+= (
"// Arguments (double) mu, (double)eta and (double)delta \n")
244 prolog_str+=
"#define SHARED_TMNDEG_FLOATS_PER_THREAD "+str(2*sharedFloatsPerFlavor)+
"\n" 245 prolog_str+=
"#define FLAVORS 2\n\n" 247 print "Undefined prolog" 252 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 254 #else // Open64 compiler 255 #define VOLATILE volatile 260 if dslash ==
True: prolog_str+=
def_gauge()
263 if (sharedFloatsPerFlavor > 0):
268 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 270 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 277 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200 279 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200 286 if sharedFloatsPerFlavor > 0:
289 extern __shared__ char s_data[]; 295 VOLATILE spinorFloat *s = (spinorFloat*)s_data + SHARED_TMNDEG_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE) 296 + (threadIdx.x % SHARED_STRIDE); 302 #include "read_gauge.h" 303 #include "io_spinor.h" 316 if (kernel_type == INTERIOR_KERNEL) { 319 // Assume even dimensions 320 coordsFromIndex3D<EVEN_X>(X, coord, sid, param); 322 // only need to check Y and Z dims currently since X and T set to match exactly 323 if (coord[1] >= param.dc.X[1]) return; 324 if (coord[2] >= param.dc.X[2]) return; 332 if (kernel_type == INTERIOR_KERNEL) { 335 sid = blockIdx.x*blockDim.x + threadIdx.x; 336 if (sid >= param.threads) return; 338 // Assume even dimensions 339 coordsFromIndex<4,QUDA_4D_PC,EVEN_X>(X, coord, sid, param); 359 } else { // exterior kernel 361 sid = blockIdx.x*blockDim.x + threadIdx.x; 362 if (sid >= param.threads) return; 364 const int face_volume = (param.threads >> 1); // volume of one face (per flavor) 365 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1 366 face_idx = sid - face_num*face_volume; // index into the respective face 368 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP) 369 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading) 370 //sp_idx = face_idx + param.ghostOffset[dim]; 372 coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param); 380 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid); 394 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid+param.fl_stride, sid+param.fl_stride); 409 prolog_str+=
"#endif // MULTI_GPU\n\n\n" 414 #include "io_spinor.h" 416 int sid = blockIdx.x*blockDim.x + threadIdx.x; 417 if (sid >= param.threads) return; 419 // read spinor from device memory 420 READ_SPINOR(SPINORTEX, param.sp_stride, sid, sid); 426 def gen(dir, pack_only=False):
427 projIdx = dir
if not dagger
else dir + (1 - 2*(dir%2))
430 return projectors[projIdx][4*i+j]
437 return (1, proj(i,1))
439 return (0, proj(i,0))
441 boundary = [
"coord[0]==(param.dc.X[0]-1)",
"coord[0]==0",
"coord[1]==(param.dc.X[1]-1)",
"coord[1]==0",
"coord[2]==(param.dc.X[2]-1)",
"coord[2]==0",
"coord[3]==(param.dc.X[3]-1)",
"coord[3]==0"]
442 interior = [
"coord[0]<(param.dc.X[0]-1)",
"coord[0]>0",
"coord[1]<(param.dc.X[1]-1)",
"coord[1]>0",
"coord[2]<(param.dc.X[2]-1)",
"coord[2]>0",
"coord[3]<(param.dc.X[3]-1)",
"coord[3]>0"]
443 dim = [
"X",
"Y",
"Z",
"T"]
446 sp_idx = [
"X+1",
"X-1",
"X+param.dc.X[0]",
"X-param.dc.X[0]",
"X+param.dc.X2X1",
"X-param.dc.X2X1",
"X+param.dc.X3X2X1",
"X-param.dc.X3X2X1"]
449 sp_idx_wrap = [
"X-(param.dc.X[0]-1)",
"X+(param.dc.X[0]-1)",
"X-param.dc.X2X1mX1",
"X+param.dc.X2X1mX1",
"X-param.dc.X3X2X1mX2X1",
"X+param.dc.X3X2X1mX2X1",
450 "X-param.dc.X4X3X2X1mX3X2X1",
"X+param.dc.X4X3X2X1mX3X2X1"]
453 cond +=
"#ifdef MULTI_GPU\n" 454 cond +=
"if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim["+`dir/2`+
"] || "+interior[dir]+
")) ||\n" 455 cond +=
" (kernel_type == EXTERIOR_KERNEL_"+dim[dir/2]+
" && "+boundary[dir]+
") )\n" 460 projName =
"P"+`dir/2`+[
"-",
"+"][projIdx%2]
461 str +=
"// Projector "+projName+
"\n" 462 for l
in projStr.splitlines():
466 str +=
"#ifdef MULTI_GPU\n" 467 str +=
"const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? ("+boundary[dir]+
" ? "+sp_idx_wrap[dir]+
" : "+sp_idx[dir]+
") >> 1 :\n" 468 str +=
" face_idx + param.ghostOffset[static_cast<int>(kernel_type)][" + `(dir+1)%2` +
"];\n" 469 str +=
"#if (DD_PREC==2) // half precision\n" 470 str +=
"const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][" + `(dir+1)%2` +
"];\n" 473 str +=
"const int sp_idx = ("+boundary[dir]+
" ? "+sp_idx_wrap[dir]+
" : "+sp_idx[dir]+
") >> 1;\n" 478 str +=
"const int ga_idx = sid;\n" 480 str +=
"#ifdef MULTI_GPU\n" 481 str +=
"const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);\n" 483 str +=
"const int ga_idx = sp_idx;\n" 488 row_cnt = ([0,0,0,0])
493 if re != 0
or im != 0:
495 row_cnt[0] += row_cnt[1]
496 row_cnt[2] += row_cnt[3]
499 for h
in range(0, 2):
500 for c
in range(0, 3):
501 decl_half +=
"spinorFloat "+
h1_re(h,c)+
", "+
h1_im(h,c)+
";\n";
504 load_gauge =
"// read gauge matrix from device memory\n" 505 load_gauge +=
"READ_GAUGE_MATRIX(G, GAUGE"+`dir%2`+
"TEX, "+`dir`+
", ga_idx, param.gauge_stride);\n\n" 507 reconstruct_gauge =
"// reconstruct gauge matrix\n" 508 reconstruct_gauge +=
"RECONSTRUCT_GAUGE_MATRIX("+`dir`+
");\n\n" 511 load_flv1 =
"// read flavor 1 from device memory\n" 513 load_flv1 +=
"READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 514 elif row_cnt[2] == 0:
515 load_flv1 +=
"READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 517 load_flv1 +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n" 521 load_flv2 =
"// read flavor 2 from device memory\n" 523 load_flv2 +=
"READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);\n" 524 elif row_cnt[2] == 0:
525 load_flv2 +=
"READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);\n" 527 load_flv2 +=
"READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);\n" 532 load_half_cond +=
"const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];\n" 537 if dir >= 6: load_half_cond +=
"const int t_proj_scale = TPROJSCALE;\n" 538 load_half_cond +=
"\n" 540 load_half_flv1 =
"// read half spinor for the first flavor from device memory\n" 552 load_half_flv1 +=
"READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, "+`dir`+
");\n\n" 554 load_half_flv2 =
"// read half spinor for the second flavor from device memory\n" 555 load_half_flv2 +=
"const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];\n" 564 load_half_flv2 +=
"READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],"+`dir`+
");\n\n" 566 project =
"// project spinor into half spinors\n" 567 for h
in range(0, 2):
568 for c
in range(0, 3):
571 for s
in range(0, 4):
574 if re==0
and im==0: ()
582 for s
in range(0, 4):
583 re = proj(h+2,s).real
584 im = proj(h+2,s).imag
585 if re==0
and im==0: ()
593 project +=
h1_re(h,c)+
" = "+strRe+
";\n" 594 project +=
h1_im(h,c)+
" = "+strIm+
";\n" 597 """// store spinor into shared memory 598 WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);\n 602 """// load spinor from shared memory 603 int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1; 605 READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);\n 609 """// load spinor from shared memory 610 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x; 611 int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0; 612 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n 616 """// load spinor from shared memory 617 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x; 618 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1; 619 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n 623 """// load spinor from shared memory 624 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x; 625 int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0; 626 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n 630 """// load spinor from shared memory 631 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x; 632 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1; 633 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n 638 for h
in range(0, 2):
639 for c
in range(0, 3):
640 copy_half +=
h1_re(h,c)+
" = "+(
"t_proj_scale*" if (dir >= 6)
else "")+
in_re(h,c)+
"; " 641 copy_half +=
h1_im(h,c)+
" = "+(
"t_proj_scale*" if (dir >= 6)
else "")+
in_im(h,c)+
";\n" 646 prep_half_cond1 +=
"#ifdef MULTI_GPU\n" 647 prep_half_cond1 +=
"if (kernel_type == INTERIOR_KERNEL) {\n" 648 prep_half_cond1 +=
"#endif\n" 649 prep_half_cond1 +=
"\n" 652 prep_half_flv1 +=
indent(load_flv1)
653 prep_half_flv1 +=
indent(project)
656 prep_half_flv2 +=
indent(load_flv2)
657 prep_half_flv2 +=
indent(project)
659 prep_half_cond2 =
"\n" 660 prep_half_cond2 +=
"#ifdef MULTI_GPU\n" 661 prep_half_cond2 +=
"} else {\n" 662 prep_half_cond2 +=
"\n" 664 prep_face_flv1 =
indent(load_half_flv1)
665 prep_face_flv2 =
indent(load_half_flv2)
667 prep_half =
indent(copy_half)
669 prep_half_cond3 =
"}\n" 670 prep_half_cond3 +=
"#endif // MULTI_GPU\n" 671 prep_half_cond3 +=
"\n" 673 ident =
"// identity gauge matrix\n" 676 ident +=
"spinorFloat "+
h2_re(h,m)+
" = " +
h1_re(h,m) +
"; " 677 ident +=
"spinorFloat "+
h2_im(h,m)+
" = " +
h1_im(h,m) +
";\n" 682 mult +=
"// multiply row "+`m`+
"\n" 684 re =
"spinorFloat "+
h2_re(h,m)+
" = 0;\n" 685 im =
"spinorFloat "+
h2_im(h,m)+
" = 0;\n" 687 re +=
h2_re(h,m) +
" += " +
g_re(dir,m,c) +
" * "+
h1_re(h,c)+
";\n" 688 re +=
h2_re(h,m) +
" -= " +
g_im(dir,m,c) +
" * "+
h1_im(h,c)+
";\n" 689 im +=
h2_im(h,m) +
" += " +
g_re(dir,m,c) +
" * "+
h1_im(h,c)+
";\n" 690 im +=
h2_im(h,m) +
" += " +
g_im(dir,m,c) +
" * "+
h1_re(h,c)+
";\n" 694 reconstruct_flv1 =
"" 701 reconstruct_flv1 +=
out1_re(h_out, m) +
" += " +
h2_re(h,m) +
";\n" 702 reconstruct_flv1 +=
out1_im(h_out, m) +
" += " +
h2_im(h,m) +
";\n" 708 if im == 0
and re == 0:
711 reconstruct_flv1 +=
out1_re(s, m) +
" " +
sign(re) +
"= " +
h2_re(h,m) +
";\n" 712 reconstruct_flv1 +=
out1_im(s, m) +
" " +
sign(re) +
"= " +
h2_im(h,m) +
";\n" 714 reconstruct_flv1 +=
out1_re(s, m) +
" " +
sign(-im) +
"= " +
h2_im(h,m) +
";\n" 715 reconstruct_flv1 +=
out1_im(s, m) +
" " +
sign(+im) +
"= " +
h2_re(h,m) +
";\n" 717 reconstruct_flv1 +=
"\n" 719 reconstruct_flv2 =
"" 726 reconstruct_flv2 +=
out2_re(h_out, m) +
" += " +
h2_re(h,m) +
";\n" 727 reconstruct_flv2 +=
out2_im(h_out, m) +
" += " +
h2_im(h,m) +
";\n" 733 if im == 0
and re == 0:
736 reconstruct_flv2 +=
out2_re(s, m) +
" " +
sign(re) +
"= " +
h2_re(h,m) +
";\n" 737 reconstruct_flv2 +=
out2_im(s, m) +
" " +
sign(re) +
"= " +
h2_im(h,m) +
";\n" 739 reconstruct_flv2 +=
out2_re(s, m) +
" " +
sign(-im) +
"= " +
h2_im(h,m) +
";\n" 740 reconstruct_flv2 +=
out2_im(s, m) +
" " +
sign(+im) +
"= " +
h2_re(h,m) +
";\n" 742 reconstruct_flv2 +=
"\n" 747 str +=
"if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)\n" 748 str +=
block(
"{\n" + prep_half_cond1 + prep_half_flv1 + prep_half_cond2 + load_half_cond + prep_face_flv1 + prep_half + prep_half_cond3 + ident + reconstruct_flv1 +
"}\n" +
"{\n" + prep_half_cond1 + prep_half_flv2 + prep_half_cond2 + load_half_cond + prep_face_flv2 + prep_half + prep_half_cond3 + ident + reconstruct_flv2 +
"}\n")
750 str +=
block(load_gauge + reconstruct_gauge +
"{\n"+ prep_half_cond1 + prep_half_flv1 + prep_half_cond2 + load_half_cond + prep_face_flv1 + prep_half + prep_half_cond3 + mult + reconstruct_flv1 +
"}\n" +
"{\n"+ prep_half_cond1 + prep_half_flv2 + prep_half_cond2 + load_half_cond + prep_face_flv2 + prep_half + prep_half_cond3 + mult + reconstruct_flv2 +
"}\n")
752 str += decl_half + load_gauge + reconstruct_gauge
753 str +=
"{\n" + prep_half_cond1 + prep_half_flv1 + prep_half_cond2 + load_half_cond + prep_face_flv1 + prep_half + prep_half_cond3 + mult + reconstruct_flv1 +
"}\n" 754 str +=
"{\n" + prep_half_cond1 + prep_half_flv2 + prep_half_cond2 + load_half_cond + prep_face_flv2 + prep_half + prep_half_cond3 + mult + reconstruct_flv2 +
"}\n" 757 out = load_spinor + decl_half + project
758 out = out.replace(
"sp_idx",
"idx")
761 return cond +
block(str)+
"\n\n" 768 str +=
"#ifdef SPINOR_DOUBLE\n" 769 str +=
"const spinorFloat a = param.a;\n" 770 str +=
"const spinorFloat b = param.b;\n" 772 str +=
"const spinorFloat a = param.a_f;\n" 773 str +=
"const spinorFloat b = param.b_f;\n" 777 str +=
"//Perform twist rotation first:\n" 779 str +=
"//(1 + i*a*gamma_5 * tau_3 + b * tau_1)\n" 781 str +=
"//(1 - i*a*gamma_5 * tau_3 + b * tau_1)\n" 782 str +=
"volatile spinorFloat x1_re, x1_im, y1_re, y1_im;\n" 783 str +=
"volatile spinorFloat x2_re, x2_im, y2_re, y2_im;\n\n" 785 str +=
"x1_re = 0.0, x1_im = 0.0;\n" 786 str +=
"y1_re = 0.0, y1_im = 0.0;\n" 787 str +=
"x2_re = 0.0, x2_im = 0.0;\n" 788 str +=
"y2_re = 0.0, y2_im = 0.0;\n\n\n" 803 str +=
"// using o1 regs:\n" 806 str +=
"x2_re = " +
"b * " +
out1_re(h,c) +
";\n" 807 str +=
"x2_im = " +
"b * " +
out1_im(h,c) +
";\n\n" 810 str +=
"y2_re = " +
"b * " +
out1_re(h+2,c) +
";\n" 811 str +=
"y2_im = " +
"b * " +
out1_im(h+2,c) +
";\n\n\n" 812 str +=
"// using o2 regs:\n" 815 str +=
"x1_re += " +
"b * " +
out2_re(h,c) +
";\n" 816 str +=
"x1_im += " +
"b * " +
out2_im(h,c) +
";\n\n" 819 str +=
"y1_re += " +
"b * " +
out2_re(h+2,c) +
";\n" 820 str +=
"y1_im += " +
"b * " +
out2_im(h+2,c) +
";\n" 823 str +=
out1_re(h+2,c) +
" = y1_re; " +
out1_im(h+2,c) +
" = y1_im;\n" 826 str +=
out2_re(h+2,c) +
" = y2_re; " +
out2_im(h+2,c) +
" = y2_im;\n\n" 829 return "#ifdef DSLASH_TWIST\n" +
block(str) +
"\n#endif\n" 836 str +=
"#if !defined(DSLASH_XPAY) || defined(DSLASH_TWIST)\n" 837 str +=
"#ifdef SPINOR_DOUBLE\n" 838 str +=
"const spinorFloat c = param.c;\n" 840 str +=
"const spinorFloat c = param.c_f;\n" 844 str +=
"#ifndef DSLASH_XPAY\n" 849 str +=
out1_re(s,c) +
" *= c;\n" 850 str +=
out1_im(s,c) +
" *= c;\n" 856 str +=
out2_re(s,c) +
" *= c;\n" 857 str +=
out2_im(s,c) +
" *= c;\n" 862 str +=
"#ifdef DSLASH_TWIST\n" 863 str +=
"// accum spinor\n" 864 str +=
"#ifdef SPINOR_DOUBLE\n" 879 str +=
"#endif // SPINOR_DOUBLE\n\n" 881 str +=
" READ_ACCUM(ACCUMTEX, param.sp_stride)\n\n" 888 str +=
" ASSN_ACCUM(ACCUMTEX, param.sp_stride, param.fl_stride)\n\n" 899 str +=
"#undef "+
acc_re(s,c)+
"\n" 900 str +=
"#undef "+
acc_im(s,c)+
"\n" 904 str +=
"// accum spinor\n" 905 str +=
"#ifdef SPINOR_DOUBLE\n" 934 str +=
"#endif // SPINOR_DOUBLE\n\n" 938 str +=
" READ_ACCUM_FLAVOR(ACCUMTEX, param.sp_stride, param.fl_stride)\n\n" 940 str +=
"#ifdef SPINOR_DOUBLE\n" 941 str +=
"const spinorFloat a = param.a;\n" 942 str +=
"const spinorFloat b = param.b;\n" 944 str +=
"const spinorFloat a = param.a_f;\n" 945 str +=
"const spinorFloat b = param.b_f;\n" 948 str +=
" //Perform twist rotation:\n" 950 str +=
"//(1 + i*a*gamma_5 * tau_3 + b * tau_1)\n" 952 str +=
"//(1 - i*a*gamma_5 * tau_3 + b * tau_1)\n" 953 str +=
" volatile spinorFloat x1_re, x1_im, y1_re, y1_im;\n" 954 str +=
" volatile spinorFloat x2_re, x2_im, y2_re, y2_im;\n\n" 956 str +=
" x1_re = 0.0, x1_im = 0.0;\n" 957 str +=
" y1_re = 0.0, y1_im = 0.0;\n" 958 str +=
" x2_re = 0.0, x2_im = 0.0;\n" 959 str +=
" y2_re = 0.0, y2_im = 0.0;\n\n\n" 974 str +=
" // using acc1 regs:\n" 977 str +=
" x2_re = " +
"b * " +
acc1_re(h,c) +
";\n" 978 str +=
" x2_im = " +
"b * " +
acc1_im(h,c) +
";\n\n" 981 str +=
" y2_re = " +
"b * " +
acc1_re(h+2,c) +
";\n" 982 str +=
" y2_im = " +
"b * " +
acc1_im(h+2,c) +
";\n\n\n" 983 str +=
" // using acc2 regs:\n" 986 str +=
" x1_re += " +
"b * " +
acc2_re(h,c) +
";\n" 987 str +=
" x1_im += " +
"b * " +
acc2_im(h,c) +
";\n\n" 990 str +=
" y1_re += " +
"b * " +
acc2_re(h+2,c) +
";\n" 991 str +=
" y1_im += " +
"b * " +
acc2_im(h+2,c) +
";\n" 994 str +=
acc1_re(h+2,c) +
" = y1_re; " +
acc1_im(h+2,c) +
" = y1_im;\n" 997 str +=
acc2_re(h+2,c) +
" = y2_re; " +
acc2_im(h+2,c) +
" = y2_im;\n\n" 1000 str +=
"#ifdef SPINOR_DOUBLE\n" 1001 str +=
"const spinorFloat k = param.d;\n" 1003 str +=
"const spinorFloat k = param.d_f;\n" 1006 for s
in range(0,4):
1007 for c
in range(0,3):
1014 for s
in range(0,4):
1015 for c
in range(0,3):
1022 for s
in range(0,4):
1023 for c
in range(0,3):
1025 str +=
"#undef "+
acc1_re(s,c)+
"\n" 1026 str +=
"#undef "+
acc1_im(s,c)+
"\n" 1028 for s
in range(0,4):
1029 for c
in range(0,3):
1031 str +=
"#undef "+
acc2_re(s,c)+
"\n" 1032 str +=
"#undef "+
acc2_im(s,c)+
"\n" 1034 str +=
"#endif//DSLASH_TWIST\n" 1036 str +=
"#endif // DSLASH_XPAY\n" 1045 str +=
"#ifdef MULTI_GPU\n" 1048 int incomplete = 0; // Have all 8 contributions been computed for this site? 1050 switch(kernel_type) { // intentional fall-through 1051 case INTERIOR_KERNEL: 1052 incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1))); 1053 case EXTERIOR_KERNEL_T: 1054 incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1))); 1055 case EXTERIOR_KERNEL_Z: 1056 incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1))); 1057 case EXTERIOR_KERNEL_Y: 1058 incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1))); 1063 str +=
"if (!incomplete)\n" 1064 str +=
"#endif // MULTI_GPU\n" 1065 str +=
"// apply twisted mass rotation\n" 1069 str +=
"// write spinor field back to device memory\n" 1070 str +=
"WRITE_FLAVOR_SPINOR();\n\n" 1072 str +=
"// undefine to prevent warning when precision is changed\n" 1073 str +=
"#undef spinorFloat\n" 1075 str +=
"#undef WRITE_SPINOR_SHARED\n" 1076 str +=
"#undef READ_SPINOR_SHARED\n" 1077 if sharedFloatsPerFlavor > 0: str +=
"#undef SHARED_STRIDE\n\n" 1080 for m
in range(0,3):
1081 for n
in range(0,3):
1083 str +=
"#undef "+
g_re(0,m,n)+
"\n" 1084 str +=
"#undef "+
g_im(0,m,n)+
"\n" 1087 for s
in range(0,4):
1088 for c
in range(0,3):
1090 str +=
"#undef "+
in_re(s,c)+
"\n" 1091 str +=
"#undef "+
in_im(s,c)+
"\n" 1094 for s
in range(0,4):
1095 for c
in range(0,3):
1097 if 2*i < sharedFloatsPerFlavor:
1098 str +=
"#undef "+
out1_re(s,c)+
"\n" 1099 if 2*i+1 < sharedFloatsPerFlavor:
1100 str +=
"#undef "+
out1_im(s,c)+
"\n" 1103 str +=
"#undef VOLATILE\n" 1117 print "Generating dslash kernel for sm" + str(arch/10)
1119 global sharedFloatsPerFlavor
1125 sharedFloatsPerFlavor = 0
1127 sharedFloatsPerFlavor = 0
1129 sharedDslash =
False 1132 sharedFloatsPerFlavor = 0
1133 sharedDslash =
False 1136 sharedFloatsPerFlavor = 19
1137 sharedDslash =
False 1140 print "Shared floats set to " + str(sharedFloatsPerFlavor)
1148 filename =
'dslash_core/tm_ndeg_dslash_core.h' 1149 print sys.argv[0] +
": generating " + filename;
1150 f = open(filename,
'w')
1155 filename =
'dslash_core/tm_ndeg_dslash_dagger_core.h' 1156 print sys.argv[0] +
": generating " + filename +
"\n";
1157 f = open(filename,
'w')
1168 sharedFloatsPerFlavor = 0
1169 sharedDslash =
False
def indent(code)
code generation ######################################################################## ...
def generate_dslash()
temporal
def gen(dir, pack_only=False)
def generate_dslash_kernels(arch)
def complexify(a)
complex numbers ######################################################################## ...