QUDA  0.9.0
fused_exterior_ndeg_tm_dslash_cuda_gen.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 import sys
3 
4 
5 
6 def complexify(a):
7  return [complex(x) for x in a]
8 
9 def complexToStr(c):
10  def fltToString(a):
11  if a == int(a): return `int(a)`
12  else: return `a`
13 
14  def imToString(a):
15  if a == 0: return "0i"
16  elif a == -1: return "-i"
17  elif a == 1: return "i"
18  else: return fltToString(a)+"i"
19 
20  re = c.real
21  im = c.imag
22  if re == 0 and im == 0: return "0"
23  elif re == 0: return imToString(im)
24  elif im == 0: return fltToString(re)
25  else:
26  im_str = "-"+imToString(-im) if im < 0 else "+"+imToString(im)
27  return fltToString(re)+im_str
28 
29 
30 
31 
32 id = complexify([
33  1, 0, 0, 0,
34  0, 1, 0, 0,
35  0, 0, 1, 0,
36  0, 0, 0, 1
37 ])
38 
39 gamma1 = complexify([
40  0, 0, 0, 1j,
41  0, 0, 1j, 0,
42  0, -1j, 0, 0,
43  -1j, 0, 0, 0
44 ])
45 
46 gamma2 = complexify([
47  0, 0, 0, 1,
48  0, 0, -1, 0,
49  0, -1, 0, 0,
50  1, 0, 0, 0
51 ])
52 
53 gamma3 = complexify([
54  0, 0, 1j, 0,
55  0, 0, 0, -1j,
56  -1j, 0, 0, 0,
57  0, 1j, 0, 0
58 ])
59 
60 gamma4 = complexify([
61  1, 0, 0, 0,
62  0, 1, 0, 0,
63  0, 0, -1, 0,
64  0, 0, 0, -1
65 ])
66 
67 igamma5 = complexify([
68  0, 0, 1j, 0,
69  0, 0, 0, 1j,
70  1j, 0, 0, 0,
71  0, 1j, 0, 0
72 ])
73 
74 
75 def gplus(g1, g2):
76  return [x+y for (x,y) in zip(g1,g2)]
77 
78 def gminus(g1, g2):
79  return [x-y for (x,y) in zip(g1,g2)]
80 
82  out = ""
83  for i in range(0, 4):
84  for j in range(0,4):
85  out += complexToStr(p[4*i+j]) + " "
86  out += "\n"
87  return out
88 
89 projectors = [
90  gminus(id,gamma1), gplus(id,gamma1),
91  gminus(id,gamma2), gplus(id,gamma2),
92  gminus(id,gamma3), gplus(id,gamma3),
93  gminus(id,gamma4), gplus(id,gamma4),
94 ]
95 
96 
97 
98 def indent(code):
99  def indentline(line): return (" "+line if (line.count("#", 0, 1) == 0) else line)
100  return ''.join([indentline(line)+"\n" for line in code.splitlines()])
101 
102 def block(code):
103  return "{\n"+indent(code)+"}"
104 
105 def sign(x):
106  if x==1: return "+"
107  elif x==-1: return "-"
108  elif x==+2: return "+2*"
109  elif x==-2: return "-2*"
110 
111 def nthFloat4(n):
112  return `(n/4)` + "." + ["x", "y", "z", "w"][n%4]
113 
114 def nthFloat2(n):
115  return `(n/2)` + "." + ["x", "y"][n%2]
116 
117 
118 def in_re(s, c): return "i"+`s`+`c`+"_re"
119 def in_im(s, c): return "i"+`s`+`c`+"_im"
120 def g_re(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_re"
121 def g_im(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_im"
122 def out1_re(s, c): return "o1_"+`s`+`c`+"_re"
123 def out1_im(s, c): return "o1_"+`s`+`c`+"_im"
124 def out2_re(s, c): return "o2_"+`s`+`c`+"_re"
125 def out2_im(s, c): return "o2_"+`s`+`c`+"_im"
126 def h1_re(h, c): return ["a","b"][h]+`c`+"_re"
127 def h1_im(h, c): return ["a","b"][h]+`c`+"_im"
128 def h2_re(h, c): return ["A","B"][h]+`c`+"_re"
129 def h2_im(h, c): return ["A","B"][h]+`c`+"_im"
130 def a_re(b, s, c): return "a"+`(s+2*b)`+`c`+"_re"
131 def a_im(b, s, c): return "a"+`(s+2*b)`+`c`+"_im"
132 
133 def tmp_re(s, c): return "tmp"+`s`+`c`+"_re"
134 def tmp_im(s, c): return "tmp"+`s`+`c`+"_im"
135 
136 def acc_re(s, c): return "acc_"+`s`+`c`+"_re"
137 def acc_im(s, c): return "acc_"+`s`+`c`+"_im"
138 def acc1_re(s, c): return "acc1_"+`s`+`c`+"_re"
139 def acc1_im(s, c): return "acc1_"+`s`+`c`+"_im"
140 def acc2_re(s, c): return "acc2_"+`s`+`c`+"_re"
141 def acc2_im(s, c): return "acc2_"+`s`+`c`+"_im"
142 
143 
145  str = ""
146  str += "// input spinor\n"
147  str += "#ifdef SPINOR_DOUBLE\n"
148  str += "#define spinorFloat double\n"
149  if sharedDslash:
150  str += "#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2\n"
151  str += "#define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2\n"
152 
153  for s in range(0,4):
154  for c in range(0,3):
155  i = 3*s+c
156  str += "#define "+in_re(s,c)+" I"+nthFloat2(2*i+0)+"\n"
157  str += "#define "+in_im(s,c)+" I"+nthFloat2(2*i+1)+"\n"
158  str += "#else\n"
159  str += "#define spinorFloat float\n"
160  if sharedDslash:
161  str += "#define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4\n"
162  str += "#define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4\n"
163  for s in range(0,4):
164  for c in range(0,3):
165  i = 3*s+c
166  str += "#define "+in_re(s,c)+" I"+nthFloat4(2*i+0)+"\n"
167  str += "#define "+in_im(s,c)+" I"+nthFloat4(2*i+1)+"\n"
168  str += "#endif // SPINOR_DOUBLE\n\n"
169  return str
170 # end def def_input_spinor
171 
172 
173 def def_gauge():
174  str = "// gauge link\n"
175  str += "#ifdef GAUGE_FLOAT2\n"
176  for m in range(0,3):
177  for n in range(0,3):
178  i = 3*m+n
179  str += "#define "+g_re(0,m,n)+" G"+nthFloat2(2*i+0)+"\n"
180  str += "#define "+g_im(0,m,n)+" G"+nthFloat2(2*i+1)+"\n"
181 
182  str += "\n"
183  str += "#else\n"
184  for m in range(0,3):
185  for n in range(0,3):
186  i = 3*m+n
187  str += "#define "+g_re(0,m,n)+" G"+nthFloat4(2*i+0)+"\n"
188  str += "#define "+g_im(0,m,n)+" G"+nthFloat4(2*i+1)+"\n"
189 
190  str += "\n"
191  str += "#endif // GAUGE_DOUBLE\n\n"
192 
193  str += "// conjugated gauge link\n"
194  for m in range(0,3):
195  for n in range(0,3):
196  i = 3*m+n
197  str += "#define "+g_re(1,m,n)+" (+"+g_re(0,n,m)+")\n"
198  str += "#define "+g_im(1,m,n)+" (-"+g_im(0,n,m)+")\n"
199  str += "\n"
200 
201  return str
202 # end def def_gauge
203 
204 
205 
207 # sharedDslash = True: input spinors stored in shared memory
208 # sharedDslash = False: output spinors stored in shared memory
209  str = "// output spinor for flavor 1\n"
210  for s in range(0,4):
211  for c in range(0,3):
212  i = 3*s+c
213  if 2*i < sharedFloatsPerFlavor and not sharedDslash:
214  str += "#define "+out1_re(s,c)+" s["+`(2*i+0)`+"*SHARED_STRIDE]\n"
215  else:
216  str += "VOLATILE spinorFloat "+out1_re(s,c)+";\n"
217  if 2*i+1 < sharedFloatsPerFlavor and not sharedDslash:
218  str += "#define "+out1_im(s,c)+" s["+`(2*i+1)`+"*SHARED_STRIDE]\n"
219  else:
220  str += "VOLATILE spinorFloat "+out1_im(s,c)+";\n"
221 
222  str += "// output spinor for flavor 2\n"
223  for s in range(0,4):
224  for c in range(0,3):
225  i = 3*s+c
226  if 2*i < sharedFloatsPerFlavor and not sharedDslash:
227  str += "#define "+out2_re(s,c)+" s["+`(2*i+0)+sharedFloatsPerFlavor`+"*SHARED_STRIDE]\n"
228  else:
229  str += "VOLATILE spinorFloat "+out2_re(s,c)+";\n"
230  if 2*i+1 < sharedFloatsPerFlavor and not sharedDslash:
231  str += "#define "+out2_im(s,c)+" s["+`(2*i+1)+sharedFloatsPerFlavor`+"*SHARED_STRIDE]\n"
232  else:
233  str += "VOLATILE spinorFloat "+out2_im(s,c)+";\n"
234  return str
235 # end def def_output_spinor
236 
237 
238 def prolog():
239  global arch
240 #WARNING: change for twisted mass!
241  if dslash:
242  prolog_str= ("// *** CUDA NDEG TWISTED MASS DSLASH ***\n\n" if not dagger else "// *** CUDA NDEG TWISTED MASS DSLASH DAGGER ***\n\n")
243  prolog_str+= ("// Arguments (double) mu, (double)eta and (double)delta \n")
244  prolog_str+= "#define SHARED_TMNDEG_FLOATS_PER_THREAD "+str(2*sharedFloatsPerFlavor)+"\n"
245  prolog_str+= "#define FLAVORS 2\n\n"
246  else:
247  print "Undefined prolog"
248  exit
249 
250  prolog_str+= (
251 """
252 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
253 #define VOLATILE
254 #else // Open64 compiler
255 #define VOLATILE volatile
256 #endif
257 """)
258 
259  prolog_str+= def_input_spinor()
260  if dslash == True: prolog_str+= def_gauge()
261  prolog_str+= def_output_spinor()
262 
263  if (sharedFloatsPerFlavor > 0):
264  if (arch >= 200):
265  prolog_str+= (
266 """
267 #ifdef SPINOR_DOUBLE
268 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
269 #else
270 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
271 #endif
272 """)
273  else:
274  prolog_str+= (
275 """
276 #ifdef SPINOR_DOUBLE
277 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
278 #else
279 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
280 #endif
281 """)
282 
283 
284  # set the pointer if using shared memory for pseudo registers
285 # if sharedFloatsPerFlavor > 0 and not sharedDslash:
286  if sharedFloatsPerFlavor > 0:
287  prolog_str += (
288 """
289 extern __shared__ char s_data[];
290 """)
291 
292  if dslash:
293  prolog_str += (
294 """
295 VOLATILE spinorFloat *s = (spinorFloat*)s_data + SHARED_TMNDEG_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
296  + (threadIdx.x % SHARED_STRIDE);
297 """)
298 
299  if dslash:
300  prolog_str += (
301 """
302 #include "read_gauge.h"
303 #include "io_spinor.h"
304 
305 int coord[5];
306 int X;
307 
308 #if (DD_PREC==2) // half precision
309 int sp_norm_idx;
310 #endif // MULTI_GPU half precision
311 
312 int sid;
313 """)
314 
315  prolog_str+= (
316 """
317  sid = blockIdx.x*blockDim.x + threadIdx.x;
318  if (sid >= param.threads) return;
319 
320  int dim = dimFromFaceIndex(sid, param); // sid is also modified
321 
322  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1);
323  // volume of one face (per flavor)
324  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
325  int face_idx = sid - face_num*face_volume; // index into the respective face
326 
327  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
328  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
329  // 4-d for first template argument here since both flavor are done by the same thread
330  switch(dim) {
331  case 0:
332  coordsFromFaceIndex<4,QUDA_4D_PC,0,1>(X, sid, coord, face_idx, face_num, param);
333  break;
334  case 1:
335  coordsFromFaceIndex<4,QUDA_4D_PC,1,1>(X, sid, coord, face_idx, face_num, param);
336  break;
337  case 2:
338  coordsFromFaceIndex<4,QUDA_4D_PC,2,1>(X, sid, coord, face_idx, face_num, param);
339  break;
340  case 3:
341  coordsFromFaceIndex<4,QUDA_4D_PC,3,1>(X, sid, coord, face_idx, face_num, param);
342  break;
343  }
344 
345 
346  bool active = false;
347  for(int dir=0; dir<4; ++dir){
348  active = active || isActive(dim,dir,+1,coord,param.commDim,param.dc.X);
349  }
350  if(!active) return;
351 
352 """)
353 
354 #for flavor 1:
355  prolog_str+= (
356 """
357  {
358  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
359 """)
360 
361  out1 = " "
362  for s in range(0,4):
363  for c in range(0,3):
364  out1 += out1_re(s,c)+" = "+in_re(s,c)+"; "+out1_im(s,c)+" = "+in_im(s,c)+";\n "
365  prolog_str+= indent(out1)
366 
367 #for flavor 2:
368  prolog_str+= (
369 """
370  }
371  {
372  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid+param.fl_stride, sid+param.fl_stride);
373 """)
374 
375  out2 = " "
376  for s in range(0,4):
377  for c in range(0,3):
378  out2 += out2_re(s,c)+" = "+in_re(s,c)+"; "+out2_im(s,c)+" = "+in_im(s,c)+";\n "
379  prolog_str+= indent(out2)
380  prolog_str+= (
381 """
382  }
383 """)
384 
385 
386  prolog_str+= "\n"
387 
388  else:
389  prolog_str+=(
390 """
391 #include "io_spinor.h"
392 
393 int sid = blockIdx.x*blockDim.x + threadIdx.x;
394 if (sid >= param.threads) return;
395 
396 // read spinor from device memory
397 READ_SPINOR(SPINORTEX, param.sp_stride, sid, sid);
398 """)
399  return prolog_str
400 # end def prolog
401 
402 
403 def gen(dir, pack_only=False):
404  projIdx = dir if not dagger else dir + (1 - 2*(dir%2))
405  projStr = projectorToStr(projectors[projIdx])
406  def proj(i,j):
407  return projectors[projIdx][4*i+j]
408 
409  # if row(i) = (j, c), then the i'th row of the projector can be represented
410  # as a multiple of the j'th row: row(i) = c row(j)
411  def row(i):
412  assert i==2 or i==3
413  if proj(i,0) == 0j:
414  return (1, proj(i,1))
415  if proj(i,1) == 0j:
416  return (0, proj(i,0))
417 
418  boundary = ["coord[0]==(param.dc.X[0]-1)", "coord[0]==0", "coord[1]==(param.dc.X[1]-1)", "coord[1]==0", "coord[2]==(param.dc.X[2]-1)", "coord[2]==0", "coord[3]==(param.dc.X[3]-1)", "coord[3]==0"]
419  interior = ["coord[0]<(param.dc.X[0]-1)", "coord[0]>0", "coord[1]<(param.dc.X[1]-1)", "coord[1]>0", "coord[2]<(param.dc.X[2]-1)", "coord[2]>0", "coord[3]<(param.dc.X[3]-1)", "coord[3]>0"]
420  offset = ["+1","-1","+1","-1","+1","-1","+1","-1"];
421  dim = ["X", "Y", "Z", "T"]
422 
423  # index of neighboring site when not on boundary
424  sp_idx = ["X+1", "X-1", "X+param.dc.X[0]", "X-param.dc.X[0]", "X+param.dc.X2X1", "X-param.dc.X2X1", "X+param.dc.X3X2X1", "X-param.dc.X3X2X1"]
425 
426  # index of neighboring site (across boundary)
427  sp_idx_wrap = ["X-(param.dc.X[0]-1)", "X+(param.dc.X[0]-1)", "X-param.dc.X2X1mX1", "X+param.dc.X2X1mX1", "X-param.dc.X3X2X1mX2X1", "X+param.dc.X3X2X1mX2X1",
428  "X-param.dc.X4X3X2X1mX3X2X1", "X+param.dc.X4X3X2X1mX3X2X1"]
429 
430  cond = ""
431 # cond += "#ifdef MULTI_GPU\n"
432 # cond += "if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim["+`dir/2`+"] || "+interior[dir]+")) ||\n"
433 # cond += " (kernel_type == EXTERIOR_KERNEL_"+dim[dir/2]+" && "+boundary[dir]+") )\n"
434 # cond += "#endif\n"
435  cond += "if (isActive(dim," + `dir/2` + "," + offset[dir] + ",coord,param.commDim,param.dc.X) && " + boundary[dir] +")\n"
436 
437  str = ""
438 
439  projName = "P"+`dir/2`+["-","+"][projIdx%2]
440  str += "// Projector "+projName+"\n"
441  for l in projStr.splitlines():
442  str += "// "+l+"\n"
443  str += "\n"
444 
445  str += "faceIndexFromCoords<4,1>(face_idx,coord," + `dir/2` + ",param);\n"
446  str += "const int sp_idx = face_idx + param.ghostOffset[" + `dir/2` + "][" + `1-dir%2` + "];\n"
447 
448  str += "#if (DD_PREC==2)\n"
449  str += " sp_norm_idx = face_idx + "
450  str += "param.ghostNormOffset[" + `dir/2` + "][" + `1-dir%2` + "];\n"
451  str += "#endif"
452  str += "\n"
453 
454  if dir % 2 == 0:
455  str += "const int ga_idx = sid;\n"
456  else:
457  str += "const int ga_idx = param.dc.Vh+face_idx;\n"
458  str += "\n"
459 
460  # scan the projector to determine which loads are required
461  row_cnt = ([0,0,0,0])
462  for h in range(0,4):
463  for s in range(0,4):
464  re = proj(h,s).real
465  im = proj(h,s).imag
466  if re != 0 or im != 0:
467  row_cnt[h] += 1
468  row_cnt[0] += row_cnt[1]
469  row_cnt[2] += row_cnt[3]
470 
471  decl_half = ""
472  for h in range(0, 2):
473  for c in range(0, 3):
474  decl_half += "spinorFloat "+h1_re(h,c)+", "+h1_im(h,c)+";\n";
475  decl_half += "\n"
476 
477  load_gauge = "// read gauge matrix from device memory\n"
478  load_gauge += "READ_GAUGE_MATRIX(G, GAUGE"+`dir%2`+"TEX, "+`dir`+", ga_idx, param.gauge_stride);\n\n"
479 
480  reconstruct_gauge = "// reconstruct gauge matrix\n"
481  reconstruct_gauge += "RECONSTRUCT_GAUGE_MATRIX("+`dir`+");\n\n"
482 
483 #flavor 1:
484  load_flv1 = "// read flavor 1 from device memory\n"
485  if row_cnt[0] == 0:
486  load_flv1 += "READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n"
487  elif row_cnt[2] == 0:
488  load_flv1 += "READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n"
489  else:
490  load_flv1 += "READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n"
491  load_flv1 += "\n"
492 
493 #flavor 2:
494  load_flv2 = "// read flavor 2 from device memory\n"
495  if row_cnt[0] == 0:
496  load_flv2 += "READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);\n"
497  elif row_cnt[2] == 0:
498  load_flv2 += "READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);\n"
499  else:
500  load_flv2 += "READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);\n"
501  load_flv2 += "\n"
502 
503 
504  load_half_cond = ""
505  load_half_cond += "const int sp_stride_pad = FLAVORS*param.dc.ghostFace[" + `dir/2` + "];\n"
506  #load_half += "#if (DD_PREC==2) // half precision\n"
507  #load_half += "const int sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];\n"
508  #load_half += "#endif\n"
509 
510  if dir >= 6: load_half_cond += "const int t_proj_scale = TPROJSCALE;\n"
511  load_half_cond += "\n"
512 
513  load_half_flv1 = "// read half spinor for the first flavor from device memory\n"
514 # we have to use the same volume index for backwards and forwards gathers
515 # instead of using READ_UP_SPINOR and READ_DOWN_SPINOR, just use READ_HALF_SPINOR with the appropriate shift
516 # if (dir+1) % 2 == 0:
517 # load_half_flv1 += "READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);\n\n"
518 # else:
519 #flavor offset: extra param.dc.ghostFace[static_cast<int>(kernel_type)]
520 # load_half_flv1 += "READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);\n\n"
521  load_half_flv1 += "READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, "+`dir`+");\n\n"
522 
523  load_half_flv2 = "// read half spinor for the second flavor from device memory\n"
524  load_half_flv2 += "const int fl_idx = sp_idx + param.dc.ghostFace[" + `dir/2` + "];\n"
525  load_half_flv2 += "#if (DD_PREC==2)\n"
526  load_half_flv2 += "const int fl_norm_idx = sp_norm_idx + param.dc.ghostFace[" + `dir/2` + "];\n"
527  load_half_flv2 += "#endif\n"
528 # we have to use the same volume index for backwards and forwards gathers
529 # instead of using READ_UP_SPINOR and READ_DOWN_SPINOR, just use READ_HALF_SPINOR with the appropriate shift
530 # if (dir+1) % 2 == 0:
531 # load_half_flv2 += "READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[" + `dir/2` + "]);\n\n"
532 # else:
533 #flavor offset: extra param.dc.ghostFace[static_cast<int>(kernel_type)]
534 # load_half_flv2 += "READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+param.dc.ghostFace[" + `dir/2` + "]);\n\n"
535  load_half_flv2 += "READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, fl_norm_idx, "+`dir`+");\n\n"
536 
537 
538  project = "// project spinor into half spinors\n"
539  for h in range(0, 2):
540  for c in range(0, 3):
541  strRe = ""
542  strIm = ""
543  for s in range(0, 4):
544  re = proj(h,s).real
545  im = proj(h,s).imag
546  if re==0 and im==0: ()
547  elif im==0:
548  strRe += sign(re)+in_re(s,c)
549  strIm += sign(re)+in_im(s,c)
550  elif re==0:
551  strRe += sign(-im)+in_im(s,c)
552  strIm += sign(im)+in_re(s,c)
553  if row_cnt[0] == 0: # projector defined on lower half only
554  for s in range(0, 4):
555  re = proj(h+2,s).real
556  im = proj(h+2,s).imag
557  if re==0 and im==0: ()
558  elif im==0:
559  strRe += sign(re)+in_re(s,c)
560  strIm += sign(re)+in_im(s,c)
561  elif re==0:
562  strRe += sign(-im)+in_im(s,c)
563  strIm += sign(im)+in_re(s,c)
564 
565  project += h1_re(h,c)+" = "+strRe+";\n"
566  project += h1_im(h,c)+" = "+strIm+";\n"
567 
568  write_shared = (
569 """// store spinor into shared memory
570 WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);\n
571 """)
572 
573  load_shared_1 = (
574 """// load spinor from shared memory
575 int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
576 __syncthreads();
577 READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);\n
578 """)
579 
580  load_shared_2 = (
581 """// load spinor from shared memory
582 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
583 int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
584 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n
585 """)
586 
587  load_shared_3 = (
588 """// load spinor from shared memory
589 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
590 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
591 READ_SPINOR_SHARED(tx, ty, threadIdx.z);\n
592 """)
593 
594  load_shared_4 = (
595 """// load spinor from shared memory
596 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
597 int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
598 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n
599 """)
600 
601  load_shared_5 = (
602 """// load spinor from shared memory
603 int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
604 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
605 READ_SPINOR_SHARED(tx, threadIdx.y, tz);\n
606 """)
607 
608 
609  copy_half = ""
610  for h in range(0, 2):
611  for c in range(0, 3):
612  copy_half += h1_re(h,c)+" = "+("t_proj_scale*" if (dir >= 6) else "")+in_re(h,c)+"; "
613  copy_half += h1_im(h,c)+" = "+("t_proj_scale*" if (dir >= 6) else "")+in_im(h,c)+";\n"
614 
615  copy_half += "\n"
616 
617  prep_face_flv1 = indent(load_half_flv1)
618  prep_face_flv2 = indent(load_half_flv2)
619 
620  prep_half = indent(copy_half)
621 
622  ident = "// identity gauge matrix\n"
623  for m in range(0,3):
624  for h in range(0,2):
625  ident += "spinorFloat "+h2_re(h,m)+" = " + h1_re(h,m) + "; "
626  ident += "spinorFloat "+h2_im(h,m)+" = " + h1_im(h,m) + ";\n"
627  ident += "\n"
628 
629  mult = ""
630  for m in range(0,3):
631  mult += "// multiply row "+`m`+"\n"
632  for h in range(0,2):
633  re = "spinorFloat "+h2_re(h,m)+" = 0;\n"
634  im = "spinorFloat "+h2_im(h,m)+" = 0;\n"
635  for c in range(0,3):
636  re += h2_re(h,m) + " += " + g_re(dir,m,c) + " * "+h1_re(h,c)+";\n"
637  re += h2_re(h,m) + " -= " + g_im(dir,m,c) + " * "+h1_im(h,c)+";\n"
638  im += h2_im(h,m) + " += " + g_re(dir,m,c) + " * "+h1_im(h,c)+";\n"
639  im += h2_im(h,m) + " += " + g_im(dir,m,c) + " * "+h1_re(h,c)+";\n"
640  mult += re + im
641  mult += "\n"
642 
643  reconstruct_flv1 = ""
644  for m in range(0,3):
645 
646  for h in range(0,2):
647  h_out = h
648  if row_cnt[0] == 0: # projector defined on lower half only
649  h_out = h+2
650  reconstruct_flv1 += out1_re(h_out, m) + " += " + h2_re(h,m) + ";\n"
651  reconstruct_flv1 += out1_im(h_out, m) + " += " + h2_im(h,m) + ";\n"
652 
653  for s in range(2,4):
654  (h,c) = row(s)
655  re = c.real
656  im = c.imag
657  if im == 0 and re == 0:
658  ()
659  elif im == 0:
660  reconstruct_flv1 += out1_re(s, m) + " " + sign(re) + "= " + h2_re(h,m) + ";\n"
661  reconstruct_flv1 += out1_im(s, m) + " " + sign(re) + "= " + h2_im(h,m) + ";\n"
662  elif re == 0:
663  reconstruct_flv1 += out1_re(s, m) + " " + sign(-im) + "= " + h2_im(h,m) + ";\n"
664  reconstruct_flv1 += out1_im(s, m) + " " + sign(+im) + "= " + h2_re(h,m) + ";\n"
665 
666  reconstruct_flv1 += "\n"
667 
668  reconstruct_flv2 = ""
669  for m in range(0,3):
670 
671  for h in range(0,2):
672  h_out = h
673  if row_cnt[0] == 0: # projector defined on lower half only
674  h_out = h+2
675  reconstruct_flv2 += out2_re(h_out, m) + " += " + h2_re(h,m) + ";\n"
676  reconstruct_flv2 += out2_im(h_out, m) + " += " + h2_im(h,m) + ";\n"
677 
678  for s in range(2,4):
679  (h,c) = row(s)
680  re = c.real
681  im = c.imag
682  if im == 0 and re == 0:
683  ()
684  elif im == 0:
685  reconstruct_flv2 += out2_re(s, m) + " " + sign(re) + "= " + h2_re(h,m) + ";\n"
686  reconstruct_flv2 += out2_im(s, m) + " " + sign(re) + "= " + h2_im(h,m) + ";\n"
687  elif re == 0:
688  reconstruct_flv2 += out2_re(s, m) + " " + sign(-im) + "= " + h2_im(h,m) + ";\n"
689  reconstruct_flv2 += out2_im(s, m) + " " + sign(+im) + "= " + h2_re(h,m) + ";\n"
690 
691  reconstruct_flv2 += "\n"
692 
693 
694  if dir >= 6:
695  str += decl_half
696  str += "if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)\n"
697  str += block("{\n" + load_half_cond + prep_face_flv1 + prep_half + ident + reconstruct_flv1 + "}\n" + "{\n" + load_half_cond + prep_face_flv2 + prep_half + ident + reconstruct_flv2 + "}\n")
698  str += " else "
699  str += block(load_gauge + reconstruct_gauge + "{\n"+ load_half_cond + prep_face_flv1 + prep_half + mult + reconstruct_flv1 + "}\n" + "{\n" + load_half_cond + prep_face_flv2 + prep_half + mult + reconstruct_flv2 +"}\n")
700  else:
701  str += decl_half + load_gauge + reconstruct_gauge
702  str +="{\n" + load_half_cond + prep_face_flv1 + prep_half + mult + reconstruct_flv1 + "}\n"
703  str +="{\n" + load_half_cond + prep_face_flv2 + prep_half + mult + reconstruct_flv2 + "}\n"
704 
705  if pack_only:
706  out = load_spinor + decl_half + project
707  out = out.replace("sp_idx", "idx")
708  return out
709  else:
710  return cond + block(str)+"\n\n"
711 # end def gen
712 
713 
714 def twisted():
715 
716  str = ""
717  str += "#ifdef SPINOR_DOUBLE\n"
718  str += "const spinorFloat a = param.a;\n"
719  str += "const spinorFloat b = param.b;\n"
720  str += "#else\n"
721  str += "const spinorFloat a = param.a_f;\n"
722  str += "const spinorFloat b = param.b_f;\n"
723  str += "#endif\n"
724 
725  str += "//Perform twist rotation first:\n"
726  if dagger :
727  str += "//(1 + i*a*gamma_5 * tau_3 + b * tau_1)\n"
728  else:
729  str += "//(1 - i*a*gamma_5 * tau_3 + b * tau_1)\n"
730  str += "volatile spinorFloat x1_re, x1_im, y1_re, y1_im;\n"
731  str += "volatile spinorFloat x2_re, x2_im, y2_re, y2_im;\n\n"
732 
733  str += "x1_re = 0.0, x1_im = 0.0;\n"
734  str += "y1_re = 0.0, y1_im = 0.0;\n"
735  str += "x2_re = 0.0, x2_im = 0.0;\n"
736  str += "y2_re = 0.0, y2_im = 0.0;\n\n\n"
737 
738  a1 = ""
739  a2 = ""
740 
741  if dagger :
742  a1 += " - a *"
743  a2 += " + a *"
744  else:
745  a1 += " + a *"
746  a2 += " - a *"
747 
748  for c in range(0,3):
749  for h in range(0,2):
750  #h, h+2
751  str += "// using o1 regs:\n"
752  str += "x1_re = " + out1_re(h,c) + a1 + out1_im(h+2,c) + ";\n"
753  str += "x1_im = " + out1_im(h,c) + a2 + out1_re(h+2,c) + ";\n"
754  str += "x2_re = " + "b * " + out1_re(h,c) + ";\n"
755  str += "x2_im = " + "b * " + out1_im(h,c) + ";\n\n"
756  str += "y1_re = " + out1_re(h+2,c) + a1 + out1_im(h,c) + ";\n"
757  str += "y1_im = " + out1_im(h+2,c) + a2 + out1_re(h,c) + ";\n"
758  str += "y2_re = " + "b * " + out1_re(h+2,c) + ";\n"
759  str += "y2_im = " + "b * " + out1_im(h+2,c) + ";\n\n\n"
760  str += "// using o2 regs:\n"
761  str += "x2_re += " + out2_re(h,c) + a2 + out2_im(h+2,c) + ";\n"
762  str += "x2_im += " + out2_im(h,c) + a1 + out2_re(h+2,c) + ";\n"
763  str += "x1_re += " + "b * " + out2_re(h,c) + ";\n"
764  str += "x1_im += " + "b * " + out2_im(h,c) + ";\n\n"
765  str += "y2_re += " + out2_re(h+2,c) + a2 + out2_im(h,c) + ";\n"
766  str += "y2_im += " + out2_im(h+2,c) + a1 + out2_re(h,c) + ";\n"
767  str += "y1_re += " + "b * " + out2_re(h+2,c) + ";\n"
768  str += "y1_im += " + "b * " + out2_im(h+2,c) + ";\n"
769  str += "\n\n"
770  str += out1_re(h,c) + " = x1_re; " + out1_im(h,c) + " = x1_im;\n"
771  str += out1_re(h+2,c) + " = y1_re; " + out1_im(h+2,c) + " = y1_im;\n"
772  str += "\n"
773  str += out2_re(h,c) + " = x2_re; " + out2_im(h,c) + " = x2_im;\n"
774  str += out2_re(h+2,c) + " = y2_re; " + out2_im(h+2,c) + " = y2_im;\n\n"
775  #str += "#endif\n"
776 
777  return "#ifdef DSLASH_TWIST\n" + block(str) + "\n#endif\n"
778 # end def twisted
779 
780 
781 def xpay():
782 
783  str = "\n"
784  str += "#if !defined(DSLASH_XPAY) || defined(DSLASH_TWIST)\n"
785  str += "#ifdef SPINOR_DOUBLE\n"
786  str += "const spinorFloat c = param.c;\n"
787  str += "#else\n"
788  str += "const spinorFloat c = param.c_f;\n"
789  str += "#endif\n"
790  str += "#endif\n"
791 
792  str += "#ifndef DSLASH_XPAY\n"
793 
794  for s in range(0,4):
795  for c in range(0,3):
796  i = 3*s+c
797  str += out1_re(s,c) +" *= c;\n"
798  str += out1_im(s,c) +" *= c;\n"
799  str += "\n"
800 
801  for s in range(0,4):
802  for c in range(0,3):
803  i = 3*s+c
804  str += out2_re(s,c) +" *= c;\n"
805  str += out2_im(s,c) +" *= c;\n"
806 
807 
808  str += "#else\n"
809 
810  str += "#ifdef DSLASH_TWIST\n"
811  str += "// accum spinor\n"
812  str += "#ifdef SPINOR_DOUBLE\n"
813  str += "\n"
814  for s in range(0,4):
815  for c in range(0,3):
816  i = 3*s+c
817  str += "#define "+acc_re(s,c)+" accum"+nthFloat2(2*i+0)+"\n"
818  str += "#define "+acc_im(s,c)+" accum"+nthFloat2(2*i+1)+"\n"
819  str += "\n"
820  str += "#else\n"
821  for s in range(0,4):
822  for c in range(0,3):
823  i = 3*s+c
824  str += "#define "+acc_re(s,c)+" accum"+nthFloat4(2*i+0)+"\n"
825  str += "#define "+acc_im(s,c)+" accum"+nthFloat4(2*i+1)+"\n"
826  str += "\n"
827  str += "#endif // SPINOR_DOUBLE\n\n"
828  str += "{\n"
829  str += " READ_ACCUM(ACCUMTEX, param.sp_stride)\n\n"
830  for s in range(0,4):
831  for c in range(0,3):
832  i = 3*s+c
833  str += " " + out1_re(s,c) +" = c*"+out1_re(s,c)+ " + "+ acc_re(s,c)+";\n"
834  str += " " + out1_im(s,c) +" = c*"+out1_im(s,c)+ " + "+ acc_im(s,c)+";\n"
835  str += "\n"
836  str += " ASSN_ACCUM(ACCUMTEX, param.sp_stride, param.fl_stride)\n\n"
837  for s in range(0,4):
838  for c in range(0,3):
839  i = 3*s+c
840  str += " " + out2_re(s,c) +" = c*"+out2_re(s,c)+ " + "+ acc_re(s,c)+";\n"
841  str += " " + out2_im(s,c) +" = c*"+out2_im(s,c)+ " + "+ acc_im(s,c)+";\n"
842  str += "}\n"
843  str += "\n"
844  for s in range(0,4):
845  for c in range(0,3):
846  i = 3*s+c
847  str += "#undef "+acc_re(s,c)+"\n"
848  str += "#undef "+acc_im(s,c)+"\n"
849  str += "\n"
850  str += "#else\n"
851 
852  str += "// accum spinor\n"
853  str += "#ifdef SPINOR_DOUBLE\n"
854  str += "\n"
855  for s in range(0,4):
856  for c in range(0,3):
857  i = 3*s+c
858  str += "#define "+acc1_re(s,c)+" flv1_accum"+nthFloat2(2*i+0)+"\n"
859 
860  str += "#define "+acc1_im(s,c)+" flv1_accum"+nthFloat2(2*i+1)+"\n"
861  str += "\n"
862  for s in range(0,4):
863  for c in range(0,3):
864  i = 3*s+c
865  str += "#define "+acc2_re(s,c)+" flv2_accum"+nthFloat2(2*i+0)+"\n"
866  str += "#define "+acc2_im(s,c)+" flv2_accum"+nthFloat2(2*i+1)+"\n"
867  str += "\n"
868  str += "#else\n"
869  str += "\n"
870  for s in range(0,4):
871  for c in range(0,3):
872  i = 3*s+c
873  str += "#define "+acc1_re(s,c)+" flv1_accum"+nthFloat4(2*i+0)+"\n"
874  str += "#define "+acc1_im(s,c)+" flv1_accum"+nthFloat4(2*i+1)+"\n"
875  str += "\n"
876  for s in range(0,4):
877  for c in range(0,3):
878  i = 3*s+c
879  str += "#define "+acc2_re(s,c)+" flv2_accum"+nthFloat4(2*i+0)+"\n"
880  str += "#define "+acc2_im(s,c)+" flv2_accum"+nthFloat4(2*i+1)+"\n"
881  str += "\n"
882  str += "#endif // SPINOR_DOUBLE\n\n"
883 
884  str += "{\n"
885 
886  str += " READ_ACCUM_FLAVOR(ACCUMTEX, param.sp_stride, param.fl_stride)\n\n"
887 
888  str += "#ifdef SPINOR_DOUBLE\n"
889  str += "const spinorFloat a = param.a;\n"
890  str += "const spinorFloat b = param.b;\n"
891  str += "#else\n"
892  str += "const spinorFloat a = param.a_f;\n"
893  str += "const spinorFloat b = param.b_f;\n"
894  str += "#endif\n"
895 
896  str += " //Perform twist rotation:\n"
897  if dagger :
898  str += "//(1 + i*a*gamma_5 * tau_3 + b * tau_1)\n"
899  else:
900  str += "//(1 - i*a*gamma_5 * tau_3 + b * tau_1)\n"
901  str += " volatile spinorFloat x1_re, x1_im, y1_re, y1_im;\n"
902  str += " volatile spinorFloat x2_re, x2_im, y2_re, y2_im;\n\n"
903 
904  str += " x1_re = 0.0, x1_im = 0.0;\n"
905  str += " y1_re = 0.0, y1_im = 0.0;\n"
906  str += " x2_re = 0.0, x2_im = 0.0;\n"
907  str += " y2_re = 0.0, y2_im = 0.0;\n\n\n"
908 
909  a1 = ""
910  a2 = ""
911 
912  if dagger :
913  a1 += " - a *"
914  a2 += " + a *"
915  else:
916  a1 += " + a *"
917  a2 += " - a *"
918 
919  for c in range(0,3):
920  for h in range(0,2):
921  #h, h+2
922  str += " // using acc1 regs:\n"
923  str += " x1_re = " + acc1_re(h,c) + a1 + acc1_im(h+2,c) + ";\n"
924  str += " x1_im = " + acc1_im(h,c) + a2 + acc1_re(h+2,c) + ";\n"
925  str += " x2_re = " + "b * " + acc1_re(h,c) + ";\n"
926  str += " x2_im = " + "b * " + acc1_im(h,c) + ";\n\n"
927  str += " y1_re = " + acc1_re(h+2,c) + a1 + acc1_im(h,c) + ";\n"
928  str += " y1_im = " + acc1_im(h+2,c) + a2 + acc1_re(h,c) + ";\n"
929  str += " y2_re = " + "b * " + acc1_re(h+2,c) + ";\n"
930  str += " y2_im = " + "b * " + acc1_im(h+2,c) + ";\n\n\n"
931  str += " // using acc2 regs:\n"
932  str += " x2_re += " + acc2_re(h,c) + a2 + acc2_im(h+2,c) + ";\n"
933  str += " x2_im += " + acc2_im(h,c) + a1 + acc2_re(h+2,c) + ";\n"
934  str += " x1_re += " + "b * " + acc2_re(h,c) + ";\n"
935  str += " x1_im += " + "b * " + acc2_im(h,c) + ";\n\n"
936  str += " y2_re += " + acc2_re(h+2,c) + a2 + acc2_im(h,c) + ";\n"
937  str += " y2_im += " + acc2_im(h+2,c) + a1 + acc2_re(h,c) + ";\n"
938  str += " y1_re += " + "b * " + acc2_re(h+2,c) + ";\n"
939  str += " y1_im += " + "b * " + acc2_im(h+2,c) + ";\n"
940  str += "\n\n"
941  str += acc1_re(h,c) + " = x1_re; " + acc1_im(h,c) + " = x1_im;\n"
942  str += acc1_re(h+2,c) + " = y1_re; " + acc1_im(h+2,c) + " = y1_im;\n"
943  str += "\n"
944  str += acc2_re(h,c) + " = x2_re; " + acc2_im(h,c) + " = x2_im;\n"
945  str += acc2_re(h+2,c) + " = y2_re; " + acc2_im(h+2,c) + " = y2_im;\n\n"
946 
947 
948  str += "#ifdef SPINOR_DOUBLE\n"
949  str += "const spinorFloat k = param.d;\n"
950  str += "#else\n"
951  str += "const spinorFloat k = param.d_f;\n"
952  str += "#endif\n"
953 
954  for s in range(0,4):
955  for c in range(0,3):
956  i = 3*s+c
957  str += " " + out1_re(s,c) +" = k*"+out1_re(s,c) + " + "+ acc1_re(s,c)+";\n"
958  str += " " + out1_im(s,c) +" = k*"+out1_im(s,c) + " + "+ acc1_im(s,c)+ ";\n"
959 
960  str += "\n"
961 
962  for s in range(0,4):
963  for c in range(0,3):
964  i = 3*s+c
965  str += " " + out2_re(s,c) +" = k*"+out2_re(s,c) + " + "+ acc2_re(s,c)+ ";\n"
966  str += " " + out2_im(s,c) +" = k*"+out2_im(s,c) + " + "+ acc2_im(s,c)+ ";\n"
967 
968  str += "}\n"
969  str += "\n"
970  for s in range(0,4):
971  for c in range(0,3):
972  i = 3*s+c
973  str += "#undef "+acc1_re(s,c)+"\n"
974  str += "#undef "+acc1_im(s,c)+"\n"
975  str += "\n"
976  for s in range(0,4):
977  for c in range(0,3):
978  i = 3*s+c
979  str += "#undef "+acc2_re(s,c)+"\n"
980  str += "#undef "+acc2_im(s,c)+"\n"
981  str += "\n"
982  str += "#endif//DSLASH_TWIST\n"
983  str += "\n"
984  str += "#endif // DSLASH_XPAY\n"
985 
986  return str
987 # end def xpay
988 
989 
990 def epilog():
991  str = ""
992  str += "// apply twisted mass rotation\n"
993  str += block( "\n" + twisted() + xpay() )
994 
995  str += "\n\n"
996  str += "// write spinor field back to device memory\n"
997  str += "WRITE_FLAVOR_SPINOR();\n\n"
998 
999  str += "// undefine to prevent warning when precision is changed\n"
1000  str += "#undef spinorFloat\n"
1001  if sharedDslash:
1002  str += "#undef WRITE_SPINOR_SHARED\n"
1003  str += "#undef READ_SPINOR_SHARED\n"
1004  if sharedFloatsPerFlavor > 0: str += "#undef SHARED_STRIDE\n\n"
1005 
1006  if dslash:
1007  for m in range(0,3):
1008  for n in range(0,3):
1009  i = 3*m+n
1010  str += "#undef "+g_re(0,m,n)+"\n"
1011  str += "#undef "+g_im(0,m,n)+"\n"
1012  str += "\n"
1013 
1014  for s in range(0,4):
1015  for c in range(0,3):
1016  i = 3*s+c
1017  str += "#undef "+in_re(s,c)+"\n"
1018  str += "#undef "+in_im(s,c)+"\n"
1019  str += "\n"
1020 #fixme
1021  for s in range(0,4):
1022  for c in range(0,3):
1023  i = 3*s+c
1024  if 2*i < sharedFloatsPerFlavor:
1025  str += "#undef "+out1_re(s,c)+"\n"
1026  if 2*i+1 < sharedFloatsPerFlavor:
1027  str += "#undef "+out1_im(s,c)+"\n"
1028  str += "\n"
1029 
1030  str += "#undef VOLATILE\n"
1031 
1032  return str
1033 # end def epilog
1034 
1035 
1036 
1038  return prolog() + gen(0) + gen(1) + gen(2) + gen(3) + gen(4) + gen(5) + gen(6) + gen(7) + epilog()
1039 # return prolog() + epilog()
1040 
1041 
1042 # generate Wilson-like Dslash kernels
1044  print "Generating dslash kernel for sm" + str(arch/10)
1045 
1046  global sharedFloatsPerFlavor
1047  global sharedDslash
1048  global dslash
1049  global dagger
1050  global twist
1051 
1052  sharedFloatsPerFlavor = 0
1053  if arch >= 200:
1054  sharedFloatsPerFlavor = 0
1055  #sharedDslash = True
1056  sharedDslash = False
1057  name = "fermi"
1058  elif arch >= 120:
1059  sharedFloatsPerFlavor = 0
1060  sharedDslash = False
1061  name = "gt200"
1062  else:
1063  sharedFloatsPerFlavor = 19
1064  sharedDslash = False
1065  name = "g80"
1066 
1067  print "Shared floats set to " + str(sharedFloatsPerFlavor)
1068 
1069  dslash = True
1070  twist = False
1071  dagger = False
1072 
1073  twist = True
1074  dagger = False
1075  filename = 'dslash_core/tm_ndeg_fused_exterior_dslash_core.h'
1076  print sys.argv[0] + ": generating " + filename;
1077  f = open(filename, 'w')
1078  f.write(generate_dslash())
1079  f.close()
1080 
1081  dagger = True
1082  filename = 'dslash_core/tm_ndeg_fused_exterior_dslash_dagger_core.h'
1083  print sys.argv[0] + ": generating " + filename + "\n";
1084  f = open(filename, 'w')
1085  f.write(generate_dslash())
1086  f.close()
1087 
1088  dslash = False
1089 
1090 
1091 
1092 dslash = False
1093 dagger = False
1094 twist = False
1095 sharedFloatsPerFlavor = 0
1096 sharedDslash = False
1097 
1098 # generate dslash kernels
1099 #arch = 200
1100 #generate_dslash_kernels(arch)
1101 
1102 arch = 200
1104 
1105 #arch = 100
1106 #generate_dslash_kernels(arch)
Definition: gen.py:1
def complexify(a)
complex numbers ######################################################################## ...
def indent(code)
code generation ######################################################################## ...