QUDA  0.9.0
fused_exterior_dw_dslash_4D_cuda_gen.py
Go to the documentation of this file.
1 # -*- coding: utf-8 -*-
2 import sys
3 
4 
5 
6 def complexify(a):
7  return [complex(x) for x in a]
8 
9 def complexToStr(c):
10  def fltToString(a):
11  if a == int(a): return `int(a)`
12  else: return `a`
13 
14  def imToString(a):
15  if a == 0: return "0i"
16  elif a == -1: return "-i"
17  elif a == 1: return "i"
18  else: return fltToString(a)+"i"
19 
20  re = c.real
21  im = c.imag
22  if re == 0 and im == 0: return "0"
23  elif re == 0: return imToString(im)
24  elif im == 0: return fltToString(re)
25  else:
26  im_str = "-"+imToString(-im) if im < 0 else "+"+imToString(im)
27  return fltToString(re)+im_str
28 
29 
30 
31 
32 id = complexify([
33  1, 0, 0, 0,
34  0, 1, 0, 0,
35  0, 0, 1, 0,
36  0, 0, 0, 1
37 ])
38 
39 gamma1 = complexify([
40  0, 0, 0, 1j,
41  0, 0, 1j, 0,
42  0, -1j, 0, 0,
43  -1j, 0, 0, 0
44 ])
45 
46 gamma2 = complexify([
47  0, 0, 0, 1,
48  0, 0, -1, 0,
49  0, -1, 0, 0,
50  1, 0, 0, 0
51 ])
52 
53 gamma3 = complexify([
54  0, 0, 1j, 0,
55  0, 0, 0, -1j,
56  -1j, 0, 0, 0,
57  0, 1j, 0, 0
58 ])
59 
60 gamma4 = complexify([
61  1, 0, 0, 0,
62  0, 1, 0, 0,
63  0, 0, -1, 0,
64  0, 0, 0, -1
65 ])
66 
67 igamma5 = complexify([
68  0, 0, 1j, 0,
69  0, 0, 0, 1j,
70  1j, 0, 0, 0,
71  0, 1j, 0, 0
72 ])
73 
74 two_P_L = [ id[x] - igamma5[x]/1j for x in range(0,4*4) ]
75 two_P_R = [ id[x] + igamma5[x]/1j for x in range(0,4*4) ]
76 
77 # for s1 in range(0,4) :
78 # for s2 in range (0,4): print "%8s" % two_P_L[s1*4+s2],
79 # print " ",
80 # for s2 in range (0,4): print "%8s" % two_P_R[s1*4+s2],
81 # print ""
82 
83 
84 def gplus(g1, g2):
85  return [x+y for (x,y) in zip(g1,g2)]
86 
87 def gminus(g1, g2):
88  return [x-y for (x,y) in zip(g1,g2)]
89 
91  out = ""
92  for i in range(0, 4):
93  for j in range(0,4):
94  out += '%3s' % complexToStr(p[4*i+j])
95  out += "\n"
96  return out
97 
98 projectors = [
99  gminus(id,gamma1), gplus(id,gamma1),
100  gminus(id,gamma2), gplus(id,gamma2),
101  gminus(id,gamma3), gplus(id,gamma3),
102  gminus(id,gamma4), gplus(id,gamma4),
103 ]
104 
105 
106 
107 def indent(code, n=1):
108  def indentline(line): return (n*" "+line if ( line and line.count("#", 0, 1) == 0) else line)
109  return ''.join([indentline(line)+"\n" for line in code.splitlines()])
110 
111 def block(code):
112  return "{\n"+indent(code)+"}"
113 
114 def sign(x):
115  if x==1: return "+"
116  elif x==-1: return "-"
117  elif x==+2: return "+2*"
118  elif x==-2: return "-2*"
119 
120 def nthFloat4(n):
121  return `(n/4)` + "." + ["x", "y", "z", "w"][n%4]
122 
123 def nthFloat2(n):
124  return `(n/2)` + "." + ["x", "y"][n%2]
125 
126 
127 def in_re(s, c): return "i"+`s`+`c`+"_re"
128 def in_im(s, c): return "i"+`s`+`c`+"_im"
129 def g_re(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_re"
130 def g_im(d, m, n): return ("g" if (d%2==0) else "gT")+`m`+`n`+"_im"
131 def out_re(s, c): return "o"+`s`+`c`+"_re"
132 def out_im(s, c): return "o"+`s`+`c`+"_im"
133 def h1_re(h, c): return ["a","b"][h]+`c`+"_re"
134 def h1_im(h, c): return ["a","b"][h]+`c`+"_im"
135 def h2_re(h, c): return ["A","B"][h]+`c`+"_re"
136 def h2_im(h, c): return ["A","B"][h]+`c`+"_im"
137 def c_re(b, sm, cm, sn, cn): return "c"+`(sm+2*b)`+`cm`+"_"+`(sn+2*b)`+`cn`+"_re"
138 def c_im(b, sm, cm, sn, cn): return "c"+`(sm+2*b)`+`cm`+"_"+`(sn+2*b)`+`cn`+"_im"
139 def a_re(b, s, c): return "a"+`(s+2*b)`+`c`+"_re"
140 def a_im(b, s, c): return "a"+`(s+2*b)`+`c`+"_im"
141 
142 def tmp_re(s, c): return "tmp"+`s`+`c`+"_re"
143 def tmp_im(s, c): return "tmp"+`s`+`c`+"_im"
144 
145 
147  str = ""
148  str += "// input spinor\n"
149  str += "#ifdef SPINOR_DOUBLE\n"
150  str += "#define spinorFloat double\n"
151  for s in range(0,4):
152  for c in range(0,3):
153  i = 3*s+c
154  str += "#define "+in_re(s,c)+" I"+nthFloat2(2*i+0)+"\n"
155  str += "#define "+in_im(s,c)+" I"+nthFloat2(2*i+1)+"\n"
156  str += "#define m5 param.m5_d\n"
157  str += "#define mdwf_b5 param.mdwf_b5_d\n"
158  str += "#define mdwf_c5 param.mdwf_c5_d\n"
159  str += "#define mferm param.mferm\n"
160  str += "#define a param.a\n"
161  str += "#define b param.b\n"
162  str += "#else\n"
163  str += "#define spinorFloat float\n"
164  for s in range(0,4):
165  for c in range(0,3):
166  i = 3*s+c
167  str += "#define "+in_re(s,c)+" I"+nthFloat4(2*i+0)+"\n"
168  str += "#define "+in_im(s,c)+" I"+nthFloat4(2*i+1)+"\n"
169  str += "#define m5 param.m5_f\n"
170  str += "#define mdwf_b5 param.mdwf_b5_f\n"
171  str += "#define mdwf_c5 param.mdwf_c5_f\n"
172  str += "#define mferm param.mferm_f\n"
173  str += "#define a param.a_f\n"
174  str += "#define b param.b_f\n"
175  str += "#endif // SPINOR_DOUBLE\n\n"
176  return str
177 # end def def_input_spinor
178 
179 
180 def def_gauge():
181  str = "// gauge link\n"
182  str += "#ifdef GAUGE_FLOAT2\n"
183  for m in range(0,3):
184  for n in range(0,3):
185  i = 3*m+n
186  str += "#define "+g_re(0,m,n)+" G"+nthFloat2(2*i+0)+"\n"
187  str += "#define "+g_im(0,m,n)+" G"+nthFloat2(2*i+1)+"\n"
188 
189  str += "\n"
190  str += "#else\n"
191  for m in range(0,3):
192  for n in range(0,3):
193  i = 3*m+n
194  str += "#define "+g_re(0,m,n)+" G"+nthFloat4(2*i+0)+"\n"
195  str += "#define "+g_im(0,m,n)+" G"+nthFloat4(2*i+1)+"\n"
196 
197  str += "\n"
198  str += "#endif // GAUGE_DOUBLE\n\n"
199 
200  str += "// conjugated gauge link\n"
201  for m in range(0,3):
202  for n in range(0,3):
203  i = 3*m+n
204  str += "#define "+g_re(1,m,n)+" (+"+g_re(0,n,m)+")\n"
205  str += "#define "+g_im(1,m,n)+" (-"+g_im(0,n,m)+")\n"
206  str += "\n"
207 
208  return str
209 # end def def_gauge
210 
211 
213  str = "// first chiral block of inverted clover term\n"
214  str += "#ifdef CLOVER_DOUBLE\n"
215  i = 0
216  for m in range(0,6):
217  s = m/3
218  c = m%3
219  str += "#define "+c_re(0,s,c,s,c)+" C"+nthFloat2(i)+"\n"
220  i += 1
221  for n in range(0,6):
222  sn = n/3
223  cn = n%3
224  for m in range(n+1,6):
225  sm = m/3
226  cm = m%3
227  str += "#define "+c_re(0,sm,cm,sn,cn)+" C"+nthFloat2(i)+"\n"
228  str += "#define "+c_im(0,sm,cm,sn,cn)+" C"+nthFloat2(i+1)+"\n"
229  i += 2
230  str += "#else\n"
231  i = 0
232  for m in range(0,6):
233  s = m/3
234  c = m%3
235  str += "#define "+c_re(0,s,c,s,c)+" C"+nthFloat4(i)+"\n"
236  i += 1
237  for n in range(0,6):
238  sn = n/3
239  cn = n%3
240  for m in range(n+1,6):
241  sm = m/3
242  cm = m%3
243  str += "#define "+c_re(0,sm,cm,sn,cn)+" C"+nthFloat4(i)+"\n"
244  str += "#define "+c_im(0,sm,cm,sn,cn)+" C"+nthFloat4(i+1)+"\n"
245  i += 2
246  str += "#endif // CLOVER_DOUBLE\n\n"
247 
248  for n in range(0,6):
249  sn = n/3
250  cn = n%3
251  for m in range(0,n):
252  sm = m/3
253  cm = m%3
254  str += "#define "+c_re(0,sm,cm,sn,cn)+" (+"+c_re(0,sn,cn,sm,cm)+")\n"
255  str += "#define "+c_im(0,sm,cm,sn,cn)+" (-"+c_im(0,sn,cn,sm,cm)+")\n"
256  str += "\n"
257 
258  str += "// second chiral block of inverted clover term (reuses C0,...,C9)\n"
259  for n in range(0,6):
260  sn = n/3
261  cn = n%3
262  for m in range(0,6):
263  sm = m/3
264  cm = m%3
265  str += "#define "+c_re(1,sm,cm,sn,cn)+" "+c_re(0,sm,cm,sn,cn)+"\n"
266  if m != n: str += "#define "+c_im(1,sm,cm,sn,cn)+" "+c_im(0,sm,cm,sn,cn)+"\n"
267  str += "\n"
268 
269  return str
270 # end def def_clover
271 
273  str = "// output spinor\n"
274  for s in range(0,4):
275  for c in range(0,3):
276  i = 3*s+c
277  if 2*i < sharedFloats:
278  str += "#define "+out_re(s,c)+" s["+`(2*i+0)`+"*SHARED_STRIDE]\n"
279  else:
280  str += "VOLATILE spinorFloat "+out_re(s,c)+";\n"
281  if 2*i+1 < sharedFloats:
282  str += "#define "+out_im(s,c)+" s["+`(2*i+1)`+"*SHARED_STRIDE]\n"
283  else:
284  str += "VOLATILE spinorFloat "+out_im(s,c)+";\n"
285  return str
286 # end def def_output_spinor
287 
288 
289 def prolog():
290  prolog_str = ("#ifdef MULTI_GPU\n\n")
291  if dslash:
292  prolog_str+= ("// *** CUDA DSLASH ***\n\n" if not dagger else "// *** CUDA DSLASH DAGGER ***\n\n")
293  prolog_str+= "#define DSLASH_SHARED_FLOATS_PER_THREAD "+str(sharedFloats)+"\n\n"
294  elif clover:
295  prolog_str= ("// *** CUDA CLOVER ***\n\n")
296  prolog_str+= "#define CLOVER_SHARED_FLOATS_PER_THREAD "+str(sharedFloats)+"\n\n"
297  else:
298  print "Undefined prolog"
299  exit
300 
301 
302  prolog_str+= (
303 """
304 #if (CUDA_VERSION >= 4010)
305 #define VOLATILE
306 #else
307 #define VOLATILE volatile
308 #endif
309 """)
310 
311  prolog_str+= def_input_spinor()
312  if dslash == True:
313  if dslash4 == True:
314  prolog_str+= def_gauge()
315  if clover == True: prolog_str+= def_clover()
316  prolog_str+= def_output_spinor()
317 
318  prolog_str+= (
319 """
320 #ifdef SPINOR_DOUBLE
321 #if (__COMPUTE_CAPABILITY__ >= 200)
322 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
323 #else
324 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
325 #endif
326 #else
327 #if (__COMPUTE_CAPABILITY__ >= 200)
328 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
329 #else
330 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
331 #endif
332 #endif
333 """)
334 
335  if sharedFloats > 0:
336  prolog_str += (
337 """
338 extern __shared__ char s_data[];
339 """)
340 
341  if dslash:
342  prolog_str += (
343 """
344 VOLATILE spinorFloat *s = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
345 + (threadIdx.x % SHARED_STRIDE);
346 """)
347  else:
348  prolog_str += (
349 """
350 VOLATILE spinorFloat *s = (spinorFloat*)s_data + CLOVER_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
351 + (threadIdx.x % SHARED_STRIDE);
352 """)
353 
354 
355  if dslash:
356  if dslash4 == True:
357  prolog_str += "\n#include \"read_gauge.h\"\n"
358  if not domain_wall:
359  prolog_str += "#include \"read_clover.h\"\n"
360  prolog_str += "#include \"io_spinor.h\"\n"
361  if dslash4 == True:
362  prolog_str += (
363 """
364 #if (DD_PREC==2) // half precision
365 int sp_norm_idx;
366 #endif // half precision
367 """)
368  prolog_str += (
369 """
370 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
371 if (sid >= param.threads*param.dc.Ls) return;
372 
373 int dim;
374 int face_idx;
375 
376 
377 """)
378  if domain_wall:
379  if dslash4 == True:
380  prolog_str+=(
381 """
382 int coord[5];
383 int X;
384 """)
385  else:
386  prolog_str+=(
387 """
388 int coord[5];
389 """)
390  else:
391  prolog_str+=(
392 """
393 int coord[5];
394 int X;
395 """)
396 
397 
398 
399  if dslash4 == True:
400  prolog_str+= (
401 """
402 { // exterior kernel
403 
404 dim = dimFromFaceIndex<5>(sid, param); // sid is also modified
405 
406 //const int face_volume = (param.threads*param.dc.Ls >> 1); // volume of one face
407 const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim])*param.dc.Ls >> 1);
408 
409 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
410 face_idx = sid - face_num*face_volume; // index into the respective face
411 
412 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
413 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
414 //sp_idx = face_idx + param.ghostOffset[dim];
415 
416 switch(dim) {
417 case 0:
418  coordsFromFaceIndex<5,QUDA_4D_PC,0,1>(X, sid, coord, face_idx, face_num, param);
419  break;
420 case 1:
421  coordsFromFaceIndex<5,QUDA_4D_PC,1,1>(X, sid, coord, face_idx, face_num, param);
422  break;
423 case 2:
424  coordsFromFaceIndex<5,QUDA_4D_PC,2,1>(X, sid, coord, face_idx, face_num, param);
425  break;
426 case 3:
427  coordsFromFaceIndex<5,QUDA_4D_PC,3,1>(X, sid, coord, face_idx, face_num, param);
428  break;
429 }
430 
431 {
432  bool active = false;
433  for(int dir=0; dir<4; ++dir){
434  active = active || isActive(dim,dir,+1,coord,param.commDim,param.dc.X);
435  }
436  if(!active) return;
437 }
438 
439 
440 
441 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
442 """)
443  out = ""
444  for s in range(0,4):
445  for c in range(0,3):
446  out += out_re(s,c)+" = "+in_re(s,c)+"; "+out_im(s,c)+" = "+in_im(s,c)+";\n"
447  prolog_str+= indent(out)
448  prolog_str+= "}\n"
449 
450  if domain_wall:
451  if dslash4 == True:
452  prolog_str += (
453 """
454 // declare G## here and use ASSN below instead of READ
455 #ifdef GAUGE_FLOAT2
456 #if (DD_PREC==0) //temporal hack
457 double2 G0;
458 double2 G1;
459 double2 G2;
460 double2 G3;
461 double2 G4;
462 double2 G5;
463 double2 G6;
464 double2 G7;
465 double2 G8;
466 #else
467 float2 G0;
468 float2 G1;
469 float2 G2;
470 float2 G3;
471 float2 G4;
472 float2 G5;
473 float2 G6;
474 float2 G7;
475 float2 G8;
476 #endif
477 #else
478 float4 G0;
479 float4 G1;
480 float4 G2;
481 float4 G3;
482 float4 G4;
483 #endif
484 
485 """)
486 
487  prolog_str+= "\n\n"
488 
489  else:
490  prolog_str+=(
491 """
492 #include "read_clover.h"
493 #include "io_spinor.h"
494 
495 int sid = blockIdx.x*blockDim.x + threadIdx.x;
496 if (sid >= param.threads) return;
497 
498 // read spinor from device memory
499 READ_SPINOR(SPINORTEX, param.sp_stride, sid, sid);
500 
501 """)
502  return prolog_str
503 # end def prolog
504 
505 
506 def gen(dir, pack_only=False):
507  projIdx = dir if not dagger else dir + ( +1 if dir%2 == 0 else -1 )
508  projStr = projectorToStr(projectors[projIdx])
509  def proj(i,j):
510  return projectors[projIdx][4*i+j]
511 
512  # if row(i) = (j, c), then the i'th row of the projector can be represented
513  # as a multiple of the j'th row: row(i) = c row(j)
514  def row(i):
515  assert i==2 or i==3
516  if proj(i,0) == 0j:
517  return (1, proj(i,1))
518  if proj(i,1) == 0j:
519  return (0, proj(i,0))
520 
521  boundary = ["coord[0]==(param.dc.X[0]-1)", "coord[0]==0", "coord[1]==(param.dc.X[1]-1)", "coord[1]==0", "coord[2]==(param.dc.X[2]-1)", "coord[2]==0", "coord[3]==(param.dc.X[3]-1)", "coord[3]==0"]
522  interior = ["coord[0]<(param.dc.X[0]-1)", "coord[0]>0", "coord[1]<(param.dc.X[1]-1)", "coord[1]>0", "coord[2]<(param.dc.X[2]-1)", "coord[2]>0", "coord[3]<(param.dc.X[3]-1)", "coord[3]>0"]
523 
524  offset = ["+1", "-1", "+1", "-1", "+1", "-1", "+1", "-1"]
525 
526  dim = ["X", "Y", "Z", "T"]
527 
528  # index of neighboring site when not on boundary
529  sp_idx = ["X+1", "X-1", "X+param.dc.X[0]", "X-param.dc.X[0]", "X+param.dc.X2X1", "X-param.dc.X2X1", "X+param.dc.X3X2X1", "X-param.dc.X3X2X1"]
530 
531  # index of neighboring site (across boundary)
532  sp_idx_wrap = ["X-(param.dc.X[0]-1)", "X+(param.dc.X[0]-1)", "X-param.dc.X2X1mX1", "X+param.dc.X2X1mX1", "X-param.dc.X3X2X1mX2X1", "X+param.dc.X3X2X1mX2X1",
533  "X-param.dc.X4X3X2X1mX3X2X1", "X+param.dc.X4X3X2X1mX3X2X1"]
534 
535  cond = ""
536  cond += "if (isActive(dim," + `dir/2` + "," + offset[dir] + ",coord,param.commDim,param.dc.X) && " + boundary[dir] + " )\n"
537 
538 
539  str = ""
540 
541  projName = "P"+`dir/2`+["-","+"][projIdx%2]
542  str += "// Projector "+projName+"\n"
543  for l in projStr.splitlines():
544  str += "//"+l+"\n"
545  str += "\n"
546 
547  str += "faceIndexFromCoords<5,1>(face_idx,coord," + `dir/2` + ",param);\n"
548  str += "const int sp_idx = face_idx + param.ghostOffset[" + `dir/2` + "][" + `1-dir%2` + "];\n"
549  str += "#if (DD_PREC==2) // half precision\n"
550  str += " sp_norm_idx = face_idx + "
551 # if dir%2 == 0:
552 # str += "param.dc.Ls*param.dc.ghostFace[" + `dir/2` + "] + "
553  str += "param.ghostNormOffset[" + `dir/2` + "][" + `1-dir%2` + "];\n"
554  str += "#endif\n\n"
555  str += "\n"
556 
557 
558 
559  if dir % 2 == 0:
560  if domain_wall: str += "const int ga_idx = sid % param.dc.volume_4d_cb;\n"
561  else: str += "const int ga_idx = sid;\n"
562  else:
563  if domain_wall: str += "const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[" + `dir/2` + "]);\n"
564  else: str += "const int ga_idx = param.dc.volume_4d_cb+face_idx;\n"
565  str += "\n"
566 
567  # scan the projector to determine which loads are required
568  row_cnt = ([0,0,0,0])
569  for h in range(0,4):
570  for s in range(0,4):
571  re = proj(h,s).real
572  im = proj(h,s).imag
573  if re != 0 or im != 0:
574  row_cnt[h] += 1
575  row_cnt[0] += row_cnt[1]
576  row_cnt[2] += row_cnt[3]
577 
578  decl_half = ""
579  for h in range(0, 2):
580  for c in range(0, 3):
581  decl_half += "spinorFloat "+h1_re(h,c)+", "+h1_im(h,c)+";\n";
582  decl_half += "\n"
583 
584  load_spinor = "// read spinor from device memory\n"
585  if row_cnt[0] == 0:
586  load_spinor += "READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n"
587  elif row_cnt[2] == 0:
588  load_spinor += "READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n"
589  else:
590  load_spinor += "READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);\n"
591  load_spinor += "\n"
592 
593  load_half = ""
594  if domain_wall :
595  load_half += "const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[" + `dir/2` + "];\n"
596  else :
597  load_half += "const int sp_stride_pad = param.dc.ghostFace[" + `dir/2` + "];\n"
598 
599  if dir >= 6: load_half += "const int t_proj_scale = TPROJSCALE;\n"
600  load_half += "\n"
601  load_half += "// read half spinor from device memory\n"
602 
603 # we have to use the same volume index for backwards and forwards gathers
604 # instead of using READ_UP_SPINOR and READ_DOWN_SPINOR, just use READ_HALF_SPINOR with the appropriate shift
605  load_half += "READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, "+`dir`+");\n\n"
606 # if (dir+1) % 2 == 0: load_half += "READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);\n\n"
607 # else: load_half += "READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);\n\n"
608  load_gauge = "// read gauge matrix from device memory\n"
609  load_gauge += "ASSN_GAUGE_MATRIX(G, GAUGE"+`( dir%2)`+"TEX, "+`dir`+", ga_idx, param.gauge_stride);\n\n"
610 
611  reconstruct_gauge = "// reconstruct gauge matrix\n"
612  reconstruct_gauge += "RECONSTRUCT_GAUGE_MATRIX("+`dir`+");\n\n"
613 
614  project = "// project spinor into half spinors\n"
615  for h in range(0, 2):
616  for c in range(0, 3):
617  strRe = ""
618  strIm = ""
619  for s in range(0, 4):
620  re = proj(h,s).real
621  im = proj(h,s).imag
622  if re==0 and im==0: ()
623  elif im==0:
624  strRe += sign(re)+in_re(s,c)
625  strIm += sign(re)+in_im(s,c)
626  elif re==0:
627  strRe += sign(-im)+in_im(s,c)
628  strIm += sign(im)+in_re(s,c)
629  if row_cnt[0] == 0: # projector defined on lower half only
630  for s in range(0, 4):
631  re = proj(h+2,s).real
632  im = proj(h+2,s).imag
633  if re==0 and im==0: ()
634  elif im==0:
635  strRe += sign(re)+in_re(s,c)
636  strIm += sign(re)+in_im(s,c)
637  elif re==0:
638  strRe += sign(-im)+in_im(s,c)
639  strIm += sign(im)+in_re(s,c)
640 
641  project += h1_re(h,c)+" = "+strRe+";\n"
642  project += h1_im(h,c)+" = "+strIm+";\n"
643 
644  copy_half = ""
645  for h in range(0, 2):
646  for c in range(0, 3):
647  copy_half += h1_re(h,c)+" = "+("t_proj_scale*" if (dir >= 6) else "")+in_re(h,c)+"; "
648  copy_half += h1_im(h,c)+" = "+("t_proj_scale*" if (dir >= 6) else "")+in_im(h,c)+";\n"
649  copy_half += "\n"
650 
651  prep_half = ""
652  prep_half += "\n"
653  prep_half += "\n"
654  prep_half += indent(load_half)
655  prep_half += indent(copy_half)
656  prep_half += "\n"
657 
658  ident = "// identity gauge matrix\n"
659  for m in range(0,3):
660  for h in range(0,2):
661  ident += "spinorFloat "+h2_re(h,m)+" = " + h1_re(h,m) + "; "
662  ident += "spinorFloat "+h2_im(h,m)+" = " + h1_im(h,m) + ";\n"
663  ident += "\n"
664 
665  mult = ""
666  for m in range(0,3):
667  mult += "// multiply row "+`m`+"\n"
668  for h in range(0,2):
669  re = "spinorFloat "+h2_re(h,m)+" = 0;\n"
670  im = "spinorFloat "+h2_im(h,m)+" = 0;\n"
671  for c in range(0,3):
672  re += h2_re(h,m) + " += " + g_re(dir,m,c) + " * "+h1_re(h,c)+";\n"
673  re += h2_re(h,m) + " -= " + g_im(dir,m,c) + " * "+h1_im(h,c)+";\n"
674  im += h2_im(h,m) + " += " + g_re(dir,m,c) + " * "+h1_im(h,c)+";\n"
675  im += h2_im(h,m) + " += " + g_im(dir,m,c) + " * "+h1_re(h,c)+";\n"
676  mult += re + im
677  mult += "\n"
678 
679  reconstruct = ""
680  for m in range(0,3):
681 
682  for h in range(0,2):
683  h_out = h
684  if row_cnt[0] == 0: # projector defined on lower half only
685  h_out = h+2
686  reconstruct += out_re(h_out, m) + " += " + h2_re(h,m) + ";\n"
687  reconstruct += out_im(h_out, m) + " += " + h2_im(h,m) + ";\n"
688 
689  for s in range(2,4):
690  (h,c) = row(s)
691  re = c.real
692  im = c.imag
693  if im == 0 and re == 0:
694  ()
695  elif im == 0:
696  reconstruct += out_re(s, m) + " " + sign(re) + "= " + h2_re(h,m) + ";\n"
697  reconstruct += out_im(s, m) + " " + sign(re) + "= " + h2_im(h,m) + ";\n"
698  elif re == 0:
699  reconstruct += out_re(s, m) + " " + sign(-im) + "= " + h2_im(h,m) + ";\n"
700  reconstruct += out_im(s, m) + " " + sign(+im) + "= " + h2_re(h,m) + ";\n"
701 
702  if ( m < 2 ): reconstruct += "\n"
703 
704  if dir >= 6:
705  str += "if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)\n"
706  str += block(decl_half + prep_half + ident + reconstruct)
707  str += " else "
708  str += block(load_gauge + decl_half + prep_half + reconstruct_gauge + mult + reconstruct)
709  else:
710  str += load_gauge + decl_half + prep_half + reconstruct_gauge + mult + reconstruct
711 
712  if pack_only:
713  out = load_spinor + decl_half + project
714  out = out.replace("sp_idx", "idx")
715  return out
716  else:
717  return cond + block(str)+"\n\n"
718 # end def gen
719 
720 
722 
723  str = "\n"
724  str += "VOLATILE spinorFloat kappa;\n\n"
725  str += "#ifdef MDWF_mode // Check whether MDWF option is enabled\n"
726  str += " kappa = (spinorFloat)(-(mdwf_c5[coord[4]]*(4.0 + m5) - 1.0)/(mdwf_b5[coord[4]]*(4.0 + m5) + 1.0));\n"
727  str += "#else\n"
728  str += " kappa = 2.0*a;\n"
729  str += "#endif // select MDWF mode\n\n"
730  str += "// M5_inv operation -- NB: not partitionable!\n\n"
731  str += "// In this part, we will do the following operation in parallel way.\n\n"
732  str += "// w = M5inv * v\n"
733  str += "// 'w' means output vector\n"
734  str += "// 'v' means input vector\n"
735  str += "{\n"
736  str += " int base_idx = sid%param.dc.volume_4d_cb;\n"
737  str += " int sp_idx;\n\n"
738  str += "// let's assume the index,\n"
739  str += "// s = output vector index,\n"
740  str += "// s' = input vector index and\n"
741  str += "// 'a'= kappa5\n"
742  str += "\n"
743  str += " spinorFloat inv_d_n = 1.0 / ( 1.0 + pow(kappa,param.dc.Ls)*mferm);\n"
744  str += " spinorFloat factorR;\n"
745  str += " spinorFloat factorL;\n"
746  str += "\n"
747  str += " for(int s = 0; s < param.dc.Ls; s++)\n {\n"
748  if dagger == True :
749  str += " factorR = ( coord[4] > s ? -inv_d_n*pow(kappa,param.dc.Ls-coord[4]+s)*mferm : inv_d_n*pow(kappa,s-coord[4]))/2.0;\n\n"
750  else :
751  str += " factorR = ( coord[4] < s ? -inv_d_n*pow(kappa,param.dc.Ls-s+coord[4])*mferm : inv_d_n*pow(kappa,coord[4]-s))/2.0;\n\n"
752  str += " sp_idx = base_idx + s*param.dc.volume_4d_cb;\n"
753  str += " // read spinor from device memory\n"
754  str += " READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );\n\n"
755  str += " o00_re += factorR*(i00_re + i20_re);\n"
756  str += " o00_im += factorR*(i00_im + i20_im);\n"
757  str += " o20_re += factorR*(i00_re + i20_re);\n"
758  str += " o20_im += factorR*(i00_im + i20_im);\n"
759  str += " o01_re += factorR*(i01_re + i21_re);\n"
760  str += " o01_im += factorR*(i01_im + i21_im);\n"
761  str += " o21_re += factorR*(i01_re + i21_re);\n"
762  str += " o21_im += factorR*(i01_im + i21_im);\n"
763  str += " o02_re += factorR*(i02_re + i22_re);\n"
764  str += " o02_im += factorR*(i02_im + i22_im);\n"
765  str += " o22_re += factorR*(i02_re + i22_re);\n"
766  str += " o22_im += factorR*(i02_im + i22_im);\n"
767  str += " o10_re += factorR*(i10_re + i30_re);\n"
768  str += " o10_im += factorR*(i10_im + i30_im);\n"
769  str += " o30_re += factorR*(i10_re + i30_re);\n"
770  str += " o30_im += factorR*(i10_im + i30_im);\n"
771  str += " o11_re += factorR*(i11_re + i31_re);\n"
772  str += " o11_im += factorR*(i11_im + i31_im);\n"
773  str += " o31_re += factorR*(i11_re + i31_re);\n"
774  str += " o31_im += factorR*(i11_im + i31_im);\n"
775  str += " o12_re += factorR*(i12_re + i32_re);\n"
776  str += " o12_im += factorR*(i12_im + i32_im);\n"
777  str += " o32_re += factorR*(i12_re + i32_re);\n"
778  str += " o32_im += factorR*(i12_im + i32_im);\n\n"
779 
780  if dagger == True :
781  str += " factorL = ( coord[4] < s ? -inv_d_n*pow(kappa,param.dc.Ls-s+coord[4])*mferm : inv_d_n*pow(kappa,coord[4]-s))/2.0;\n\n"
782  else :
783  str += " factorL = ( coord[4] > s ? -inv_d_n*pow(kappa,param.dc.Ls-coord[4]+s)*mferm : inv_d_n*pow(kappa,s-coord[4]))/2.0;\n\n"
784 
785  str += " o00_re += factorL*(i00_re - i20_re);\n"
786  str += " o00_im += factorL*(i00_im - i20_im);\n"
787  str += " o01_re += factorL*(i01_re - i21_re);\n"
788  str += " o01_im += factorL*(i01_im - i21_im);\n"
789  str += " o02_re += factorL*(i02_re - i22_re);\n"
790  str += " o02_im += factorL*(i02_im - i22_im);\n"
791  str += " o10_re += factorL*(i10_re - i30_re);\n"
792  str += " o10_im += factorL*(i10_im - i30_im);\n"
793  str += " o11_re += factorL*(i11_re - i31_re);\n"
794  str += " o11_im += factorL*(i11_im - i31_im);\n"
795  str += " o12_re += factorL*(i12_re - i32_re);\n"
796  str += " o12_im += factorL*(i12_im - i32_im);\n"
797  str += " o20_re += factorL*(i20_re - i00_re);\n"
798  str += " o20_im += factorL*(i20_im - i00_im);\n"
799  str += " o21_re += factorL*(i21_re - i01_re);\n"
800  str += " o21_im += factorL*(i21_im - i01_im);\n"
801  str += " o22_re += factorL*(i22_re - i02_re);\n"
802  str += " o22_im += factorL*(i22_im - i02_im);\n"
803  str += " o30_re += factorL*(i30_re - i10_re);\n"
804  str += " o30_im += factorL*(i30_im - i10_im);\n"
805  str += " o31_re += factorL*(i31_re - i11_re);\n"
806  str += " o31_im += factorL*(i31_im - i11_im);\n"
807  str += " o32_re += factorL*(i32_re - i12_re);\n"
808  str += " o32_im += factorL*(i32_im - i12_im);\n"
809  str += " }\n"
810  str += "} // end of M5inv dimension\n\n"
811 
812  return str
813 # end def gen_dw_inv
814 
815 
816 def input_spinor(s,c,z):
817  if dslash:
818  if z==0: return out_re(s,c)
819  else: return out_im(s,c)
820  else:
821  if z==0: return in_re(s,c)
822  else: return in_im(s,c)
823 
825  str = ""
826  str += "spinorFloat "+a_re(0,0,c)+" = -"+input_spinor(1,c,0)+" - "+input_spinor(3,c,0)+";\n"
827  str += "spinorFloat "+a_im(0,0,c)+" = -"+input_spinor(1,c,1)+" - "+input_spinor(3,c,1)+";\n"
828  str += "spinorFloat "+a_re(0,1,c)+" = "+input_spinor(0,c,0)+" + "+input_spinor(2,c,0)+";\n"
829  str += "spinorFloat "+a_im(0,1,c)+" = "+input_spinor(0,c,1)+" + "+input_spinor(2,c,1)+";\n"
830  str += "spinorFloat "+a_re(0,2,c)+" = -"+input_spinor(1,c,0)+" + "+input_spinor(3,c,0)+";\n"
831  str += "spinorFloat "+a_im(0,2,c)+" = -"+input_spinor(1,c,1)+" + "+input_spinor(3,c,1)+";\n"
832  str += "spinorFloat "+a_re(0,3,c)+" = "+input_spinor(0,c,0)+" - "+input_spinor(2,c,0)+";\n"
833  str += "spinorFloat "+a_im(0,3,c)+" = "+input_spinor(0,c,1)+" - "+input_spinor(2,c,1)+";\n"
834  str += "\n"
835 
836  for s in range (0,4):
837  str += out_re(s,c)+" = "+a_re(0,s,c)+"; "
838  str += out_im(s,c)+" = "+a_im(0,s,c)+";\n"
839 
840  return block(str)+"\n\n"
841 # end def to_chiral_basis
842 
843 
844 def from_chiral_basis(c): # note: factor of 1/2 is included in clover term normalization
845  str = ""
846  str += "spinorFloat "+a_re(0,0,c)+" = "+out_re(1,c)+" + "+out_re(3,c)+";\n"
847  str += "spinorFloat "+a_im(0,0,c)+" = "+out_im(1,c)+" + "+out_im(3,c)+";\n"
848  str += "spinorFloat "+a_re(0,1,c)+" = -"+out_re(0,c)+" - "+out_re(2,c)+";\n"
849  str += "spinorFloat "+a_im(0,1,c)+" = -"+out_im(0,c)+" - "+out_im(2,c)+";\n"
850  str += "spinorFloat "+a_re(0,2,c)+" = "+out_re(1,c)+" - "+out_re(3,c)+";\n"
851  str += "spinorFloat "+a_im(0,2,c)+" = "+out_im(1,c)+" - "+out_im(3,c)+";\n"
852  str += "spinorFloat "+a_re(0,3,c)+" = -"+out_re(0,c)+" + "+out_re(2,c)+";\n"
853  str += "spinorFloat "+a_im(0,3,c)+" = -"+out_im(0,c)+" + "+out_im(2,c)+";\n"
854  str += "\n"
855 
856  for s in range (0,4):
857  str += out_re(s,c)+" = "+a_re(0,s,c)+"; "
858  str += out_im(s,c)+" = "+a_im(0,s,c)+";\n"
859 
860  return block(str)+"\n\n"
861 # end def from_chiral_basis
862 
863 
864 def clover_mult(chi):
865  str = "READ_CLOVER(CLOVERTEX, "+`chi`+")\n\n"
866 
867  for s in range (0,2):
868  for c in range (0,3):
869  str += "spinorFloat "+a_re(chi,s,c)+" = 0; spinorFloat "+a_im(chi,s,c)+" = 0;\n"
870  str += "\n"
871 
872  for sm in range (0,2):
873  for cm in range (0,3):
874  for sn in range (0,2):
875  for cn in range (0,3):
876  str += a_re(chi,sm,cm)+" += "+c_re(chi,sm,cm,sn,cn)+" * "+out_re(2*chi+sn,cn)+";\n"
877  if (sn != sm) or (cn != cm):
878  str += a_re(chi,sm,cm)+" -= "+c_im(chi,sm,cm,sn,cn)+" * "+out_im(2*chi+sn,cn)+";\n"
879  #else: str += ";\n"
880  str += a_im(chi,sm,cm)+" += "+c_re(chi,sm,cm,sn,cn)+" * "+out_im(2*chi+sn,cn)+";\n"
881  if (sn != sm) or (cn != cm):
882  str += a_im(chi,sm,cm)+" += "+c_im(chi,sm,cm,sn,cn)+" * "+out_re(2*chi+sn,cn)+";\n"
883  #else: str += ";\n"
884  str += "\n"
885 
886  for s in range (0,2):
887  for c in range (0,3):
888  str += out_re(2*chi+s,c)+" = "+a_re(chi,s,c)+"; "
889  str += out_im(2*chi+s,c)+" = "+a_im(chi,s,c)+";\n"
890  str += "\n"
891 
892  return block(str)+"\n\n"
893 # end def clover_mult
894 
895 
897  if domain_wall: return ""
898  str = ""
899  if dslash: str += "#ifdef DSLASH_CLOVER\n\n"
900  str += "// change to chiral basis\n"
902  str += "// apply first chiral block\n"
903  str += clover_mult(0)
904  str += "// apply second chiral block\n"
905  str += clover_mult(1)
906  str += "// change back from chiral basis\n"
907  str += "// (note: required factor of 1/2 is included in clover term normalization)\n"
909  if dslash: str += "#endif // DSLASH_CLOVER\n\n"
910 
911  return str
912 # end def clover
913 
914 
915 def coeff():
916  if dslash4:
917  str = "coeff"
918  elif dslash5:
919  str = "coeff"
920  else :
921  str = "a"
922  return str
923 # end def coeff()
924 
925 def ypax():
926  str = ""
927  str += "#ifdef SPINOR_DOUBLE\n"
928 
929  for s in range(0,4):
930  for c in range(0,3):
931  i = 3*s+c
932  str += out_re(s,c) +" = "+out_re(s,c)+" + coeff*accum"+nthFloat2(2*i+0)+";\n"
933  str += out_im(s,c) +" = "+out_im(s,c)+" + coeff*accum"+nthFloat2(2*i+1)+";\n"
934 
935  str += "#else\n"
936 
937  for s in range(0,4):
938  for c in range(0,3):
939  i = 3*s+c
940  str += out_re(s,c) +" = "+out_re(s,c)+" + coeff*accum"+nthFloat4(2*i+0)+";\n"
941  str += out_im(s,c) +" = "+out_im(s,c)+" + coeff*accum"+nthFloat4(2*i+1)+";\n"
942 
943  str += "#endif // SPINOR_DOUBLE\n"
944  return str
945 # end def ypax
946 
947 def xpay():
948  str = ""
949  str += "#ifdef DSLASH_XPAY\n"
950  str += "READ_ACCUM(ACCUMTEX, param.sp_stride)\n"
951 
952  if dslash4:
953  str += "VOLATILE spinorFloat coeff;\n\n"
954  str += "#ifdef MDWF_mode\n"
955  str += "coeff = (spinorFloat)(0.5*a/(mdwf_b5[coord[4]]*(m5+4.0) + 1.0));\n"
956  str += "#else\n"
957  str += "coeff = a;\n"
958  str += "#endif\n\n"
959  elif dslash5:
960  str += "VOLATILE spinorFloat coeff;\n\n"
961  str += "#ifdef MDWF_mode\n"
962  str += "coeff = (spinorFloat)(0.5/(mdwf_b5[coord[4]]*(m5+4.0) + 1.0));\n"
963  str += "coeff *= -coeff;\n"
964  str += "#else\n"
965  str += "coeff = a;\n"
966  str += "#endif\n\n"
967  str += "#ifdef YPAX\n"
968  str += ypax()
969  str += "#else\n"
970 
971  str += "#ifdef SPINOR_DOUBLE\n"
972 
973  for s in range(0,4):
974  for c in range(0,3):
975  i = 3*s+c
976  str += out_re(s,c) +" = "+coeff()+"*"+out_re(s,c)+" + accum"+nthFloat2(2*i+0)+";\n"
977  str += out_im(s,c) +" = "+coeff()+"*"+out_im(s,c)+" + accum"+nthFloat2(2*i+1)+";\n"
978 
979  str += "#else\n"
980 
981  for s in range(0,4):
982  for c in range(0,3):
983  i = 3*s+c
984  str += out_re(s,c) +" = "+coeff()+"*"+out_re(s,c)+" + accum"+nthFloat4(2*i+0)+";\n"
985  str += out_im(s,c) +" = "+coeff()+"*"+out_im(s,c)+" + accum"+nthFloat4(2*i+1)+";\n"
986 
987  str += "#endif // SPINOR_DOUBLE\n"
988  if dslash5:
989  str += "#endif // YPAX\n"
990  str += "#endif // DSLASH_XPAY\n"
991 
992  return str
993 # end def xpay
994 
995 
996 def epilog():
997  str = ""
998  str += block( "\n" + ( apply_clover()) + xpay() )
999 
1000  str += "\n\n"
1001  str += "// write spinor field back to device memory\n"
1002  str += "WRITE_SPINOR(param.sp_stride);\n\n"
1003 
1004  str += "// undefine to prevent warning when precision is changed\n"
1005  str += "#undef m5\n"
1006  str += "#undef mdwf_b5\n"
1007  str += "#undef mdwf_c5\n"
1008  str += "#undef mferm\n"
1009  str += "#undef a\n"
1010  str += "#undef b\n"
1011  str += "#undef spinorFloat\n"
1012  str += "#undef SHARED_STRIDE\n\n"
1013 
1014  if dslash:
1015  if dslash4 == True:
1016  for m in range(0,3):
1017  for n in range(0,3):
1018  i = 3*m+n
1019  str += "#undef "+g_re(0,m,n)+"\n"
1020  str += "#undef "+g_im(0,m,n)+"\n"
1021  str += "\n"
1022 
1023  for s in range(0,4):
1024  for c in range(0,3):
1025  i = 3*s+c
1026  str += "#undef "+in_re(s,c)+"\n"
1027  str += "#undef "+in_im(s,c)+"\n"
1028  str += "\n"
1029 
1030  if clover == True:
1031  for m in range(0,6):
1032  s = m/3
1033  c = m%3
1034  str += "#undef "+c_re(0,s,c,s,c)+"\n"
1035  for n in range(0,6):
1036  sn = n/3
1037  cn = n%3
1038  for m in range(n+1,6):
1039  sm = m/3
1040  cm = m%3
1041  str += "#undef "+c_re(0,sm,cm,sn,cn)+"\n"
1042  str += "#undef "+c_im(0,sm,cm,sn,cn)+"\n"
1043  str += "\n"
1044 
1045  for s in range(0,4):
1046  for c in range(0,3):
1047  i = 3*s+c
1048  if 2*i < sharedFloats:
1049  str += "#undef "+out_re(s,c)+"\n"
1050  if 2*i+1 < sharedFloats:
1051  str += "#undef "+out_im(s,c)+"\n"
1052  str += "\n"
1053 
1054  str += "#undef VOLATILE\n"
1055  str += "#endif // MULTI_GPU\n"
1056 
1057  return str
1058 # end def epilog
1059 
1060 
1061 def pack_face(facenum):
1062  str = "\n"
1063  str += "switch(dim) {\n"
1064  for dim in range(0,4):
1065  str += "case "+`dim`+":\n"
1066  proj = gen(2*dim+facenum, pack_only=True)
1067  proj += "\n"
1068  proj += "// write half spinor back to device memory\n"
1069  proj += "WRITE_HALF_SPINOR(face_volume, face_idx);\n"
1070  str += indent(block(proj)+"\n"+"break;\n")
1071  str += "}\n\n"
1072  return str
1073 # end def pack_face
1074 
1076  assert (sharedFloats == 0)
1077  str = ""
1078  str += def_input_spinor()
1079  str += "#include \"io_spinor.h\"\n\n"
1080 
1081  str += "if (face_num) "
1082  str += block(pack_face(1))
1083  str += " else "
1084  str += block(pack_face(0))
1085 
1086  str += "\n\n"
1087  str += "// undefine to prevent warning when precision is changed\n"
1088  str += "#undef spinorFloat\n"
1089  str += "#undef SHARED_STRIDE\n\n"
1090 
1091  for s in range(0,4):
1092  for c in range(0,3):
1093  i = 3*s+c
1094  str += "#undef "+in_re(s,c)+"\n"
1095  str += "#undef "+in_im(s,c)+"\n"
1096  str += "\n"
1097 
1098  return str
1099 # end def generate_pack
1100 
1102  r = prolog()
1103  for i in range(0,8) :
1104  r += gen( i )
1105  r += epilog()
1106  return r
1107 
1109  r = prolog()
1110  for i in range(0,8) :
1111  r += gen( i )
1112  r += epilog()
1113  return r
1114 
1116  r = prolog()
1117  r += epilog()
1118  return r
1119 
1120 #def generate_dslash5D_inv():
1121 # r = prolog()
1122 # r += gen_dw_inv()
1123 # r += epilog()
1124 # return r
1125 
1126 #def generate_clover():
1127 # return prolog() + epilog()
1128 
1129 
1130 # To fit 192 threads/SM (single precision) with 16K shared memory, set sharedFloats to 19 or smaller
1131 
1132 sharedFloats = 0
1133 cloverSharedFloats = 0
1134 if(len(sys.argv) > 1):
1135  if (sys.argv[1] == '--shared'):
1136  sharedFloats = int(sys.argv[2])
1137 print "Shared floats set to " + str(sharedFloats);
1138 
1139 # generate Domain Wall Dslash kernels
1140 domain_wall = True
1141 clover = False
1142 dslash4 = True
1143 dslash5 = False
1144 normalDWF = False
1145 
1146 print sys.argv[0] + ": generating dw_fused_exterior_dslash4_core.h";
1147 dslash = True
1148 dagger = False
1149 f = open('dslash_core/dw_fused_exterior_dslash4_core.h', 'w')
1150 f.write(generate_dslash4D())
1151 f.close()
1152 
1153 print sys.argv[0] + ": generating dw_fused_exterior_dslash4_dagger_core.h";
1154 dslash = True
1155 dagger = True
1156 f = open('dslash_core/dw_fused_exterior_dslash4_dagger_core.h', 'w')
1157 f.write(generate_dslash4D())
1158 f.close()
1159 
1160 dslash4 = False
1161 dslash5 = True
1162 
1163 # these are not needed since these kernels are all local
1164 #print sys.argv[0] + ": generating dw_fused_exterior_dslash5_core.h";
1165 #dslash = True
1166 #dagger = False
1167 #f = open('dslash_core/dw_fused_exterior_dslash5_core.h', 'w')
1168 #f.write(generate_dslash5D())
1169 #f.close()
1170 
1171 #print sys.argv[0] + ": generating dw_fused_exterior_dslash5_dagger_core.h";
1172 #dslash = True
1173 #dagger = True
1174 #f = open('dslash_core/dw_fused_exterior_dslash5_dagger_core.h', 'w')
1175 #f.write(generate_dslash5D())
1176 #f.close()
1177 
1178 #dslash5 = False
1179 
1180 #print sys.argv[0] + ": generating dw_fused_exterior_dslash5inv_core.h";
1181 #dslash = True
1182 #dagger = False
1183 #f = open('dslash_core/dw_fused_exterior_dslash5inv_core.h', 'w')
1184 #f.write(generate_dslash5D_inv())
1185 #f.close()
1186 
1187 #print sys.argv[0] + ": generating dw_fused_exterior_dslash5inv_dagger_core.h";
1188 #dslash = True
1189 #dagger = True
1190 #f = open('dslash_core/dw_fused_exterior_dslash5inv_dagger_core.h', 'w')
1191 #f.write(generate_dslash5D_inv())
1192 #f.close()
1193 
Definition: gen.py:1
def complexify(a)
complex numbers ######################################################################## ...
if(err !=cudaSuccess)
def indent(code, n=1)
code generation ######################################################################## ...